1、准备工作
下载lucene 3.6.1 : http://lucene.apache.org/
下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug)
下载solr 3.6.1: http://lucene.apache.org/solr/(编译IK Analyzer时需引用包)
OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。
2、从Oracle数据库中取数据创建索引(使用IK分词)
package lucene.util;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.sql.Connection;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import modules.gk.Gk_info;
import modules.gk.Gk_infoSub;
import web.sys.Globals;
import web.db.DBConnector;
import web.db.ObjectCtl;
import web.util.StringUtil;
//Wizzer.cn
public class LuceneIndex {
    IndexWriter writer = null;
    FSDirectory dir = null;
    boolean create = true; // whether to (re)create the index, overwriting an existing one

    /**
     * Builds or updates the Lucene index from rows of TABLEA, batch by batch,
     * using the IK analyzer for Chinese word segmentation. Each row that has
     * been indexed is flagged with SSTAG=1 so later runs skip it.
     * Configuration (index path, create/append mode, batch size) comes from
     * Globals.SYS_COM_CONFIG.
     */
    public void init() {
        long startMillis = System.currentTimeMillis();
        System.out.println("[Lucene 开始执行:" + new Date() + "]");
        Connection con = DBConnector.getconecttion(); // obtain a database connection
        try {
            // Index directory (e.g. E:\lucene); create it if it does not exist yet.
            final File docDir = new File(Globals.SYS_COM_CONFIG.get("sys.index.path").toString());
            if (!docDir.exists()) {
                docDir.mkdirs();
            }
            String cr = Globals.SYS_COM_CONFIG.get("sys.index.create").toString(); // "true" or "false"
            if ("false".equals(cr.toLowerCase())) {
                create = false;
            }
            dir = FSDirectory.open(docDir);
            // IK analyzer in smart mode; StandardAnalyzer would split Chinese
            // text into single characters instead of words.
            Analyzer analyzer = new IKAnalyzer(true);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            if (create) {
                // Create a new index in the directory, removing any previously indexed documents.
                iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            } else {
                // Add new documents to an existing index.
                iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            }
            writer = new IndexWriter(dir, iwc);
            String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 ";
            int rowCount = ObjectCtl.getRowCount(con, sql);
            int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get("sys.index.size").toString()); // rows per batch
            int pages = (rowCount - 1) / pageSize + 1; // total number of batches
            ArrayList list = null;
            Gk_infoSub gk = null;
            for (int i = 1; i <= pages; i++) {
                long batchStart = System.currentTimeMillis();
                list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub());
                for (int j = 0; j < list.size(); j++) {
                    gk = (Gk_infoSub) list.get(j);
                    Document doc = new Document();
                    // Primary key and date are stored verbatim (not analyzed);
                    // the text fields are analyzed for full-text search.
                    doc.add(new Field("indexno", StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                    doc.add(new Field("title", StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("describes", StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("pdate", StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                    doc.add(new Field("keywords", StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED));
                    writer.addDocument(doc);
                    // Mark the row as indexed. Escape single quotes so a quote in
                    // the key cannot break (or inject into) the statement.
                    // NOTE(review): a PreparedStatement with a bind variable would
                    // be safer still, if ObjectCtl exposes one.
                    String indexno = StringUtil.null2String(gk.getIndexno()).replace("'", "''");
                    ObjectCtl.executeUpdateBySql(con, "UPDATE TABLEA SET SSTAG=1 WHERE indexno='" + indexno + "'");
                }
                long batchMillis = System.currentTimeMillis() - batchStart;
                System.out.println("[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + batchMillis + "毫秒]");
            }
            writer.commit();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            DBConnector.freecon(con); // release the database connection
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    // Remove a stale write lock left behind by an aborted run.
                    if (dir != null && IndexWriter.isLocked(dir)) {
                        IndexWriter.unlock(dir);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        long totalMillis = System.currentTimeMillis() - startMillis;
        System.out.println("[Lucene 执行完毕,花费时间:" + totalMillis + "毫秒,完成时间:" + new Date() + "]");
    }
}
3、单字段查询以及多字段分页查询高亮显示
package lucene.util;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.Version;
import modules.gk.Gk_infoSub;
import java.util.ArrayList;
import java.io.File;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import web.util.StringUtil;
import web.sys.Globals;
import org.wltea.analyzer.lucene.IKAnalyzer;
//Wizzer.cn
public class LuceneQuery {
    // NOTE: was "private static" but assigned per-instance in the constructor;
    // an instance field matches the actual usage and avoids shared mutable state.
    private String indexPath;  // directory the index was built into
    private int rowCount;      // total number of hits for the last query
    private int pages;         // total number of pages for the last query
    private int currentPage;   // current page number (1-based)
    private int pageSize;      // rows per page

    public LuceneQuery() {
        this.indexPath = Globals.SYS_COM_CONFIG.get("sys.index.path").toString();
    }

    public int getRowCount() {
        return rowCount;
    }

    public int getPages() {
        return pages;
    }

    public int getPageSize() {
        return pageSize;
    }

    public int getCurrentPage() {
        return currentPage;
    }

    /**
     * Searches the "title" field (IK-analyzed, all terms required) and returns
     * one page of results with query terms highlighted.
     *
     * @param keyWord  user query string
     * @param curpage  1-based page number (values <= 0 fall back to 1)
     * @param pageSize rows per page (values <= 0 fall back to 20)
     * @return page of Gk_infoSub objects; empty list on error
     */
    public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) {
        ArrayList list = new ArrayList();
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            if (curpage <= 0) {
                curpage = 1;
            }
            if (pageSize <= 0) {
                pageSize = 20;
            }
            this.pageSize = pageSize;
            this.currentPage = curpage;
            int start = (curpage - 1) * pageSize;
            Directory dir = FSDirectory.open(new File(indexPath));
            reader = IndexReader.open(dir);
            searcher = new IndexSearcher(reader);
            Analyzer analyzer = new IKAnalyzer(true);
            QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title", analyzer);
            queryParser.setDefaultOperator(QueryParser.AND_OPERATOR); // all terms must match
            Query query = queryParser.parse(keyWord);
            // Collect just enough top docs to cover the requested page.
            int hm = start + pageSize;
            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
            searcher.search(query, res);
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
            this.rowCount = res.getTotalHits();
            this.pages = (rowCount - 1) / pageSize + 1; // total pages
            TopDocs tds = res.topDocs(start, pageSize);
            ScoreDoc[] sd = tds.scoreDocs;
            for (int i = 0; i < sd.length; i++) {
                Document hitDoc = reader.document(sd[i].doc);
                list.add(createObj(hitDoc, analyzer, highlighter));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close searcher/reader; leaving them open leaks file handles.
            try {
                if (searcher != null) {
                    searcher.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return list;
    }

    /**
     * Multi-field search over title/describes/keywords combining three inputs:
     * must contain all of {@code allkeyword}, any of {@code onekeyword}, and
     * none of {@code nokeyword}. Returns one page of highlighted results.
     *
     * @param allkeyword terms that must all appear (empty string to skip)
     * @param onekeyword terms of which at least one must appear (empty to skip)
     * @param nokeyword  terms that must not appear (empty to skip)
     * @param curpage    1-based page number (values <= 0 fall back to 1)
     * @param pageSize   rows per page (values <= 0 fall back to 20)
     * @return page of Gk_infoSub objects; empty list on error
     */
    public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) {
        ArrayList list = new ArrayList();
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            if (curpage <= 0) {
                curpage = 1;
            }
            if (pageSize <= 0) {
                pageSize = 20;
            }
            this.pageSize = pageSize;
            this.currentPage = curpage;
            int start = (curpage - 1) * pageSize;
            Directory dir = FSDirectory.open(new File(indexPath));
            reader = IndexReader.open(dir);
            searcher = new IndexSearcher(reader);
            BooleanQuery bQuery = new BooleanQuery(); // combined query
            if (!"".equals(allkeyword)) { // must contain all keywords
                // KeywordAnalyzer keeps the input as one token; SHOULD across the
                // three fields means "match in any field", and the clause as a
                // whole is required (MUST).
                KeywordAnalyzer analyzer = new KeywordAnalyzer();
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST);
            }
            if (!"".equals(onekeyword)) { // must contain at least one keyword
                Analyzer analyzer = new IKAnalyzer(true);
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST);
            }
            if (!"".equals(nokeyword)) { // must not contain these keywords
                Analyzer analyzer = new IKAnalyzer(true);
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST_NOT);
            }
            int hm = start + pageSize;
            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
            searcher.search(bQuery, res);
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery));
            this.rowCount = res.getTotalHits();
            this.pages = (rowCount - 1) / pageSize + 1; // total pages
            System.out.println("rowCount:" + rowCount);
            TopDocs tds = res.topDocs(start, pageSize);
            ScoreDoc[] sd = tds.scoreDocs;
            Analyzer analyzer = new IKAnalyzer();
            for (int i = 0; i < sd.length; i++) {
                Document hitDoc = reader.document(sd[i].doc);
                list.add(createObj(hitDoc, analyzer, highlighter));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close searcher/reader; leaving them open leaks file handles.
            try {
                if (searcher != null) {
                    searcher.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return list;
    }

    /**
     * Builds the result object from a hit document, replacing title/keywords/
     * describes with highlighted fragments when the highlighter finds a match.
     * Returns null only if an unexpected exception occurs.
     */
    private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) {
        Gk_infoSub gk = new Gk_infoSub();
        try {
            if (doc != null) {
                gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
                gk.setPdate(StringUtil.null2String(doc.get("pdate")));
                String title = StringUtil.null2String(doc.get("title"));
                gk.setTitle(title);
                if (!"".equals(title)) {
                    // Fragment as long as the whole value, so nothing is cut off.
                    highlighter.setTextFragmenter(new SimpleFragmenter(title.length()));
                    TokenStream tk = analyzer.tokenStream("title", new StringReader(title));
                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title));
                    if (!"".equals(htext)) {
                        gk.setTitle(htext);
                    }
                }
                String keywords = StringUtil.null2String(doc.get("keywords"));
                gk.setKeywords(keywords);
                if (!"".equals(keywords)) {
                    highlighter.setTextFragmenter(new SimpleFragmenter(keywords.length()));
                    TokenStream tk = analyzer.tokenStream("keywords", new StringReader(keywords));
                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords));
                    if (!"".equals(htext)) {
                        gk.setKeywords(htext);
                    }
                }
                String describes = StringUtil.null2String(doc.get("describes"));
                gk.setDescribes(describes);
                if (!"".equals(describes)) {
                    highlighter.setTextFragmenter(new SimpleFragmenter(describes.length()));
                    // FIX: tokenize against the "describes" field (was "keywords"),
                    // otherwise describes-highlighting used the wrong field.
                    TokenStream tk = analyzer.tokenStream("describes", new StringReader(describes));
                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes));
                    if (!"".equals(htext)) {
                        gk.setDescribes(htext);
                    }
                }
            }
            return gk;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Builds the result object from a hit document without highlighting. */
    private synchronized static Object createObj(Document doc) {
        Gk_infoSub gk = new Gk_infoSub();
        try {
            if (doc != null) {
                gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
                gk.setPdate(StringUtil.null2String(doc.get("pdate")));
                gk.setTitle(StringUtil.null2String(doc.get("title")));
                gk.setKeywords(StringUtil.null2String(doc.get("keywords")));
                gk.setDescribes(StringUtil.null2String(doc.get("describes")));
            }
            return gk;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}
单字段查询:
// Single-field search: read paging parameters and the (Lucene-escaped) title
// keyword from the form, run the title query, and hand the result page plus
// paging metadata back to the view layer.
long a = System.currentTimeMillis();
try {
int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
// replaceLuceneStr presumably strips/escapes Lucene query syntax characters — verify against its implementation.
String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title")));
LuceneQuery lu = new LuceneQuery();
form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize));
form.addResult("curPage", lu.getCurrentPage());
form.addResult("pageSize", lu.getPageSize());
form.addResult("rowCount", lu.getRowCount());
form.addResult("pageCount", lu.getPages());
} catch (Exception e) {
e.printStackTrace();
}
long b = System.currentTimeMillis();
long c = b - a;
System.out.println("[搜索信息花费时间:" + c + "毫秒]");
多字段查询:
// Advanced search: all-of / any-of / none-of keyword groups from the form,
// multi-field query over title/describes/keywords, results plus paging
// metadata handed back to the view layer.
long a = System.currentTimeMillis();
try {
int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword")));
String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword")));
String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword")));
LuceneQuery lu = new LuceneQuery();
form.addResult("list", lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize));
form.addResult("curPage", lu.getCurrentPage());
form.addResult("pageSize", lu.getPageSize());
form.addResult("rowCount", lu.getRowCount());
form.addResult("pageCount", lu.getPages());
} catch (Exception e) {
e.printStackTrace();
}
long b = System.currentTimeMillis();
long c = b - a;
System.out.println("[高级检索花费时间:" + c + "毫秒]");
4、Lucene通配符查询
BooleanQuery bQuery = new BooleanQuery(); // combined query
if (!"".equals(title)) {
// Prefix match on the stored title: "title*" behaves like SQL "title LIKE 'xxx%'".
WildcardQuery w1 = new WildcardQuery(new Term("title", title+ "*"));
bQuery.add(w1, BooleanClause.Occur.MUST); // AND
}
// Collect enough top docs to cover the requested page window.
int hm = start + pageSize;
TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
searcher.search(bQuery, res);
5、Lucene嵌套查询
实现SQL:(unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
// Outer query ORs two inner AND-groups, mirroring:
// (unitid LIKE 'unitid%' AND idml LIKE 'id2%')
//   OR (tounitid LIKE 'unitid%' AND tomlid LIKE 'id2%' AND tostate=1)
BooleanQuery bQuery = new BooleanQuery();
BooleanQuery b1 = new BooleanQuery();
WildcardQuery w1 = new WildcardQuery(new Term("unitid", unitid + "*"));
WildcardQuery w2 = new WildcardQuery(new Term("idml", id2 + "*"));
b1.add(w1, BooleanClause.Occur.MUST); // AND
b1.add(w2, BooleanClause.Occur.MUST); // AND
bQuery.add(b1, BooleanClause.Occur.SHOULD); // OR
BooleanQuery b2 = new BooleanQuery();
WildcardQuery w3 = new WildcardQuery(new Term("tounitid", unitid + "*"));
WildcardQuery w4 = new WildcardQuery(new Term("tomlid", id2 + "*"));
WildcardQuery w5 = new WildcardQuery(new Term("tostate", "1"));
b2.add(w3, BooleanClause.Occur.MUST); // AND
b2.add(w4, BooleanClause.Occur.MUST); // AND
b2.add(w5, BooleanClause.Occur.MUST); // AND
bQuery.add(b2, BooleanClause.Occur.SHOULD); // OR
6、Lucene先根据时间排序后分页
下面这种方式不太合理,建议在创建索引库的时候排序,这样查询的时候只用分页即可,若有多个排序条件可单独创建索引库。
// Sort-by-date then paginate. FIXES vs. the original paste: typographic quotes
// and the en-dash (copy-paste mangling) replaced with real Java syntax, and an
// off-by-one in the page window (the old "i < start" re-included the last row
// of the previous page; i is 1-based, so rows 1..start must be skipped).
int hm = start + pageSize;
// Sort by pdate (indexed as a string) in descending order.
Sort sort = new Sort(new SortField("pdate", SortField.STRING, true));
TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false);
searcher.search(bQuery, res);
this.rowCount = res.getTotalHits();
this.pages = (rowCount - 1) / pageSize + 1; // total pages
// Re-run the search sorted over all hits, then slice out the requested page.
TopDocs tds = searcher.search(bQuery, rowCount, sort);
ScoreDoc[] sd = tds.scoreDocs;
System.out.println("rowCount:" + rowCount);
int i = 0;
for (ScoreDoc scoreDoc : sd) {
    i++;
    if (i <= start) { // skip everything before the page window
        continue;
    }
    if (i > hm) { // past the page window — done
        break;
    }
    Document doc = searcher.doc(scoreDoc.doc);
    list.add(createObj(doc));
}
最新的排序写法:
// Preferred form: let Lucene do the sorted top-N search directly, then take
// the [start, hm) slice of scoreDocs as the requested page.
int hm = start + pageSize;
Sort sort = new Sort();
// Sort by pdate (indexed as a string) in descending order.
SortField sortField = new SortField("pdate", SortField.STRING, true);
sort.setSort(sortField);
TopDocs hits = searcher.search(bQuery, null, hm, sort);
this.rowCount = hits.totalHits;
this.pages = (rowCount - 1) / pageSize + 1; // total pages
for (int i = start; i < hits.scoreDocs.length; i++) {
ScoreDoc sdoc = hits.scoreDocs[i];
Document doc = searcher.doc(sdoc.doc);
list.add(createObj(doc));
}
ps:
周一完成创建索引库定时任务,周二实现模糊查询中文分词高亮显示及分页,今天实现了通配符查询、嵌套查询、先排序后分页,从零接触到实现Lucene主要功能花了三天时间,当然,性能如何还待测试和优化。