paoding-analysis-2.0.4自定义词典不行?

help 2011-02-15
paoding-analysis-2.0.4自定义词典不行?
------------------------------------
package com.m4g.bns.test;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Small stand-alone check of how {@link PaodingAnalyzer} tokenizes a piece of text:
 * one document is indexed into an in-memory Lucene index, a fixed query is run
 * against it, and the frequency of the queried term is printed.
 */
public class Test {

	/**
	 * Builds a one-document in-memory index, searches the "contents" field, and for
	 * every hit prints the frequency reported by the term's postings.
	 *
	 * <p>Every Lucene resource opened here is released in a {@code finally} block, so
	 * a failure while indexing, parsing or searching does not leak it. Any exception
	 * is printed to standard error and the method returns normally.
	 *
	 * @param args command-line arguments; not used
	 */
	public static void main(String[] args) {
		// The same Paoding analyzer is used for indexing and for query parsing.
		Analyzer analyzer = new PaodingAnalyzer();
		try {
			// In-memory index holding a single document.
			Directory directory = new RAMDirectory();
			Document doc1 = new Document();
			doc1.add(new Field("contents", "shanghai xx上海xooo4路易维尔国际机场 (SDF), 路易龟龙片甲维尔, 肯塔基州 40209, 美国", Field.Store.YES,
					Field.Index.TOKENIZED));
			doc1.setBoost(1.0f);

			IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
			try {
				iwriter.addDocument(doc1);
			} finally {
				// Close even if addDocument fails so the writer is always released.
				iwriter.close();
			}

			IndexReader reader = IndexReader.open(directory);
			try {
				QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer);
				Query query = parser.parse("塔基"); // build the query object
				query = query.rewrite(reader);

				IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only searcher
				try {
					TopDocs ts = isearcher.search(query, null, 100); // top 100 results
					ScoreDoc[] hits = ts.scoreDocs;
					for (int j = 0; j < hits.length; j++) {
						// NOTE(review): the postings lookup does not depend on hits[j], so the
						// same frequencies are printed once per hit; kept to preserve output.
						TermDocs docs = isearcher.getIndexReader().termDocs(new Term("contents", "塔基"));
						try {
							while (docs.next()) {
								System.out.println("Term在文档中的出现次数" + docs.freq());
							}
						} finally {
							docs.close();
						}
					}
				} finally {
					isearcher.close();
				}
			} finally {
				reader.close();
			}
		} catch (Exception e) {
			// Demo program: report the failure and exit normally.
			e.printStackTrace();
		}
	}
}


--------------------
我添加了一个新的字典 xxxx.dic,里面只有一个词“塔基”。

37392872kiss 2011-02-16
请多查看些paoding分词的资料,尤其是原作者的blog,关于自定义词典等都是有讲解的。我用了一年的paoding分词,paoding的自定义词典是很好用的。依稀记得是在配置文件中指定开启自定义词典,系统会定时自动扫描dic目录;加载词典过程中内存消耗会增加30%以上,这点需要注意。
Linux_Unix 2011-06-30
你去参考下中科院的分词器吧,我觉得中科院的分词器还行!
冰火人 2012-10-08
中科院的那个也不错的,只是没有源码
Global site tag (gtag.js) - Google Analytics