近期做的Lucene+盘古分词总结( net)

首先给大家推荐一下我老师大神的人工智能教学网站。教学不仅零基础,通俗易懂,而且非常风趣幽默,还时不时有内涵黄段子!点这里可以跳转到网站

Lucene.Net只是一个全文检索开发包。它的功能就是提供了全文检索功能的一个数据库。Lucene.Net不管文本数据怎么来的,用户可以基于Lucene.Net开发满足自己需求的搜索引擎。Lucene.Net只能对文本信息进行检索。如果不是文本信息,要先转换为文本信息,比如检索Excel文件,就要用NPOI把Excel读取成字符串,然后把字符串交给Lucene.Net。Lucene.Net会把交给它的文本切词保存,以加快检索速度。
Lucene.Net中不同的分词算法就是不同的类,所有分词算法类都从Analyzer类继承。
庖丁解牛、盘古分词,IKAnalyzer分词(Java)等是基于词库的分词算法,可以提高分词成功率,但是效率低。

盘古分词使用方法:
将Dict文件夹放置到项目根目录,并将其下文件的“复制到输出目录”属性设置为“如果较新则复制”
引用PanGu.dll与PanGu.Lucene.Analyzer.dll类库即可使用盘古分词算法
添加引用:PanGu.HighLight.dll

我的代码示例用的是Lucene的2.9.2版本,我也在思考,欢迎大家给我意见,我用3.0以上的Lucene用不了盘古分词,期待更好的分词出现在.net的平台下。

// Index directory path resolved once per instance by the project's provider.
        private readonly string indexPath = new LuceneAreaProvider().GetIndexPath();

        #region  分析器
        private Analyzer _analyzer = null;
        public Analyzer analyzer
        {
            get
            {
                _analyzer = new Lucene.Net.Analysis.PanGu.PanGuAnalyzer();
                return _analyzer;
            }
        }
        #endregion

        #region FSDirectory   directory_luce
        private Lucene.Net.Store.FSDirectory _directory_luce = null;

        public Lucene.Net.Store.FSDirectory directory_luce
        {
            get
            {
                if (_directory_luce == null)
                {
                    _directory_luce = Lucene.Net.Store.FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
                }

                return _directory_luce;
            }
        }
        #endregion

        #region 获取分词
        /// <summary>
        /// 获取分词
        /// </summary>
        /// <param name=”searchText”></param>
        /// <returns></returns>
        private List<string> GetSplitString(string searchText)
        {
            List<string> listResult = new List<string>();
            TokenStream tokenStream = analyzer.TokenStream(searchText, new StringReader(searchText));
            //Boolean hasNext = tokenStream.IncrementToken();
            ////Lucene.Net.Analysis.Tokenattributes.TermAttributeImpl ita;
            //while (hasNext)
            //{
            //    //ita = tokenStream.GetAttribute<Lucene.Net.Analysis.Tokenattributes.TermAttributeImpl>();

            //    //listResult.Add(tokenStream());
            //    hasNext = tokenStream.IncrementToken();
            //}

            Token token = tokenStream.Next();
            while (token != null)
            {
                listResult.Add(token.TermText());
                token = tokenStream.Next();
            }
            return listResult;
        }
        #endregion

        #region  创建索引
        /// <summary>
        /// 创建索引
        /// 连表查询所获取的数据  
        /// </summary>
        public void CreateIndex()
        {

           //这个是存在磁盘上,还有一种方式是存在内存中,这个得根据具体的情况而定

            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory()); 
            //思考判断 字典是否存在并有数据
            //有的话做追加   没有就是添加所有的数据索引
            bool isUpdate = IndexReader.IndexExists(directory);
            if (isUpdate)
            {
                if (IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }
            }
            //IndexWriter用于向索引库写内容
            //IndexWriter的第三个参数的解释:true表示删除之前的重新写入  false:表示追加
            //使用IndexWriter打开directory时会自动对索引库文件上锁  多人同时操作并发问题


            IndexWriter writer = new IndexWriter(directory, analyzer, !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
           //  getRegionData   为数据源,这个得根据个人的具体情况而定
            foreach (ApiRegionMappingEntity item in getRegionData)
            {
                Document document = new Document();
                document.Add(new Field(“id”, item.ApiRegionMapId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));//NOT_ANALYZED–不分词
                document.Add(new Field(“regionID”, item.JxRegionId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field(“ApiRegionId”, item.ApiRegionId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field(“Platform”, item.Platform.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field(“name”, item.ApiRegionName.ToString(), Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.AddDocument(document);//将文档写入索引库
            }
            writer.Optimize(); //添加完后 合并
            writer.Close();
            directory.Close();//不要忘了Close,否则索引结果搜不到
        }
        #endregion

        #region 关键字搜索 集合
        /// <summary>
        /// 搜索
        /// </summary>
        /// <param name=”searchText”></param>
        /// <returns></returns>
        public List<ApiRegionMappingEntity> Search(string searchText)
        {
            //搜索关键字
            string kw = searchText;
            //索引路径
            //string indexPath = System.AppDomain.CurrentDomain.BaseDirectory + “\\lucenedir”;
            //FS是FileSystem的简写,它的父类是Directory,Directory表示索引文件保存的地方,它有两个子类FSDirectory、RAMDirectory。使用时别和IO里的Directory混了
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());//打开索引库
            IndexReader reader = IndexReader.Open(directory, true);//IndexWriter用于从索引库读内容
            IndexSearcher searcher = new IndexSearcher(reader);//IndexSearcher用于搜索索引库
            //搜索条件
            BooleanQuery query = new BooleanQuery();
            foreach (string word in GetSplitString(searchText))
            {
                query.Add(new TermQuery(new Term(“name”, word)), BooleanClause.Occur.SHOULD);
            }
            //创建 盛放查询结果的容器
            TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
            //根据条件 查询结果放入容器
            searcher.Search(query, collector);
            //获取所有的文档数据
            List<ApiRegionMappingEntity> regionMapingList = new List<ApiRegionMappingEntity>();
            ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
            for (int i = 0; i < docs.Length; i++)
            {
                int docId = docs[i].doc;//得到查询结果文档的id(Lucene内部分配的id)
                Document doc = searcher.Doc(docId);//找到文档id对应的文档详细信息
                ApiRegionMappingEntity mappingEntity = new ApiRegionMappingEntity();
                mappingEntity.ApiRegionMapId = Convert.ToInt32(doc.Get(“id”));
                mappingEntity.JxRegionId = Convert.ToInt32(doc.Get(“regionID”));
                mappingEntity.ApiRegionId = Convert.ToInt32(doc.Get(“ApiRegionId”));
                mappingEntity.Platform = Convert.ToInt32(doc.Get(“Platform”));
                mappingEntity.ApiRegionName = doc.Get(“name”);
                mappingEntity.ApiRegionId = Convert.ToInt32(doc.Get(“ApiRegionId”));
                regionMapingList.Add(mappingEntity);
            }
            return regionMapingList;
        }
        #endregion

        #region 关键词高亮显示
        /// <summary>
        /// 关键词高亮显示
        /// </summary>
        /// <param name=”keyword”></param>
        /// <param name=”content”></param>
        /// <returns></returns>
        private string Highlight(string keyword, string content)
        {
            //创建HTMLFormatter,参数为高亮单词的前后缀
            PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                   new PanGu.HighLight.SimpleHTMLFormatter(“<font color=\”red\”><b>”, “</b></font>”);
            //创建 Highlighter ,输入HTMLFormatter 和 盘古分词对象Semgent
            PanGu.HighLight.Highlighter highlighter =
                            new PanGu.HighLight.Highlighter(simpleHTMLFormatter,
                            new Segment());
            //设置每个摘要段的字符数
            highlighter.FragmentSize = 50;
            //获取最匹配的摘要段
            return highlighter.GetBestFragment(keyword, content);
        }
        #endregion

        #region 删除索引
        /// <summary>
        /// 删除索引
        /// </summary>
        /// <param name=”field”>field</param>
        /// <param name=”value”>value</param>
        public void DeleteIndex(string field, string value)
        {
            FSDirectory directory = directory_luce;
            bool isUpdate = IndexReader.IndexExists(directory);
            if (isUpdate)
            {
                if (IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }
            }
            IndexWriter writer = new IndexWriter(directory, analyzer, !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
            writer.DeleteDocuments(new Term(field, value));
            writer.Close();
        }

        /// <summary>
        /// 删除所有 索引
        /// </summary>
        public void DelelteAll()
        {
            FSDirectory directory = directory_luce;
            bool isUpdate = IndexReader.IndexExists(directory);
            if (isUpdate)
            {
                if (IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }
            }
            IndexWriter writer = new IndexWriter(directory, analyzer, !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
            writer.DeleteAll();
            writer.Close();
        }
        #endregion

Query
TermQuery
BooleanQuery
RangeQuery范围搜索
PrefixQuery前缀搜索
PhraseQuery多关键字的搜索
FuzzyQuery 相近词语的搜索

点这里可以跳转到人工智能网站

发表评论