A simple example of full-text search in Java using Lucene

2022-07-15 80酷酷网 80kuku.com

索引源代码：
package lucene;

/**
* Title: 
* Description: 
* Copyright: Copyright (c) 2003
* Company: 
* author Shirley
* version 1.0
*/

import org.apache.lucene.index.*;
import org.apache.lucene.analysis.*;
import java.io.*;
import org.apache.lucene.document.*;

/**
 * Builds a Lucene full-text index from a list of files.
 *
 * Usage: IndexFiles [index output directory] [file to index] ...
 *
 * When fewer than two command-line arguments are supplied, a built-in sample
 * index location and sample file list are used instead.
 */
public class IndexFiles {
    /**
     * Indexes every listed file into the index directory.
     *
     * @param arg arg[0] is the index output directory and arg[1..] are the
     *            files to index; if fewer than two entries are given, the
     *            built-in sample values are used
     * @throws Exception if a file cannot be read or the index cannot be written
     */
    public static void main(String[] arg) throws Exception {
        String[] args = arg;
        if (args == null || args.length < 2) {
            // An array initializer sizes the array to its contents, so the
            // sample list can grow without a separate length to keep in sync.
            args = new String[] {
                // Directory where the index data is stored
                System.getProperty("java.io.tmpdir", "tmp") + System.getProperty("file.separator") + "index-1",
                // Files to index
                "E:\\AppWork\\lucene\\rfc2047.txt",
                "E:\\AppWork\\cyberoffice\\CO\\Sheldon Java Mail.htm",
                "E:\\AppWork\\lucene\\englishtest.doc",
                "E:\\AppWork\\cyberoffice\\CO\\xls1.xls",
                "E:\\AppWork\\cyberoffice\\CO\\ppt1.ppt"
            };
        }

        String indexPath = args[0];

        // The third IndexWriter argument is "create": true builds a new index,
        // false appends to an existing one. Append when the index directory is
        // already present; otherwise create it, because there is nothing to
        // append to on a first run.
        boolean create = !new File(indexPath).exists();
        IndexWriter writer = new IndexWriter(indexPath, new SimpleAnalyzer(), create);
        try {
            for (int i = 1; i < args.length; i++) {
                System.out.println("Indexing file " + args[i]);
                InputStream is = new FileInputStream(args[i]);
                try {
                    // Each document has two fields:
                    //   "path" - stored only, not indexed
                    //   "body" - tokenized and indexed for full-text search
                    //            (per the Lucene Field documentation, Reader-valued
                    //            text fields are indexed but not stored)
                    Document doc = new Document();
                    doc.add(Field.UnIndexed("path", args[i]));
                    doc.add(Field.Text("body", (Reader) new InputStreamReader(is)));
                    // Write the document into the index
                    writer.addDocument(doc);
                } finally {
                    // Release the file handle even when indexing this file fails
                    is.close();
                }
            }
        } finally {
            // Always close the index writer
            writer.close();
        }
    }
}

搜索源代码：
package lucene;

/**
* Title: 
* Description: 
* Copyright: Copyright (c) 2003
* Company: 
* author Shirley
* version 1.0
*/

import org.apache.lucene.search.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.analysis.*;

/**
 * Runs a single keyword query against the Lucene index and prints the path
 * and score of every matching document.
 */
public class Search {
    /**
     * Searches the "body" field of the index for a fixed keyword.
     *
     * @param arg command-line arguments (not used; the index location and the
     *            keyword are fixed in the code)
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void main(String[] arg) throws Exception {
        // Directory where the index data is stored
        String indexPath = System.getProperty("java.io.tmpdir", "tmp") + System.getProperty("file.separator") + "index-1";
        // Search keyword
        String queryString = "sending";

        // Searcher bound to the index directory
        Searcher searcher = new IndexSearcher(indexPath);
        try {
            // Parse the query with the same analyzer class that is used here
            // for the "body" field, so query terms are tokenized consistently
            Query query = QueryParser.parse(queryString, "body", new SimpleAnalyzer());
            // Matching documents are exposed through Hits
            Hits hits = searcher.search(query);
            // Print the stored "path" field and the match score of each hit
            for (int i = 0; i < hits.length(); i++) {
                System.out.println(hits.doc(i).get("path") + "; Score: " + hits.score(i));
            }
        } finally {
            // Release the index files held by the searcher, even on failure
            searcher.close();
        }
    }
}

注：目前程序只支持英文索引，可以过滤文件类型为.txt .doc .htm .xls .ppt

中文索引及其它类型文件的索引正在研究中......