扫一扫
分享文章到微信
扫一扫
关注官方公众号
至顶头条
在实现了中文切词的基础方法之后,我将其封装在一个继承自 Lucene 的 Analyzer 类的分析器中。
ChineseAnalyzer 的方法就不用多说了。
// Quoted snippet: the ChineseAnalyzer class.
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;

namespace Lucene.Fanswo
{
    /// <summary>
    /// Analyzer whose token stream starts with a ChineseTokenizer and is then
    /// passed through StandardFilter, LowerCaseFilter and StopFilter, in that order.
    /// </summary>
    public class ChineseAnalyzer : Analyzer
    {
        /// <summary>
        /// Stop-word list handed to the StopFilter: common English words plus
        /// two Chinese pronouns.
        /// </summary>
        public static readonly string[] CHINESE_ENGLISH_STOP_WORDS = new string[]
        {
            "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "s", "such",
            "t", "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with",
            "我", "我们"
        };

        /// <summary>
        /// Builds the filter chain for one field.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The outermost filter of the chain (a StopFilter).</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            // Innermost is the tokenizer; each wrapper consumes the stream inside it.
            return new StopFilter(
                new LowerCaseFilter(
                    new StandardFilter(
                        new ChineseTokenizer(reader))),
                CHINESE_ENGLISH_STOP_WORDS);
        }
    }
}
ChineseTokenizer类的实现:
这里通过词典来正向匹配字符,返回lucene下定义的token流
// Quoted snippet: the ChineseTokenizer class.
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;

namespace Lucene.Fanswo
{
    /// <summary>
    /// Tokenizer that walks the dictionary tree exposed by WordTree.chartable,
    /// matching characters forward from the current position, and returns
    /// Lucene Token objects.
    /// </summary>
    class ChineseTokenizer : Tokenizer
    {
        // Index in text at which the next call to Next() starts scanning.
        private int offset = 0;

        // Length of text.
        private int dataLen = 0;

        // The whole content of the reader, read once in the constructor.
        private string text;

        // Dictionary helper; created and loaded on the first call to Next()
        // and then reused, instead of being rebuilt for every token.
        // NOTE(review): assumes LoadDict() only needs to run once per tokenizer - confirm against WordTree.
        private WordTree tree;

        /// <summary>
        /// Time spent on segmentation. This class never writes to it.
        /// </summary>
        public double TextSeg_Span = 0;

        /// <summary>
        /// Constructs a tokenizer for the given reader and reads it to the end.
        /// </summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        public ChineseTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            text = input.ReadToEnd();
            dataLen = text.Length;
        }

        /// <summary>
        /// Returns the next token of the text, or null when no token is left.
        /// </summary>
        /// <remarks>
        /// The token's start offset is the index of its first character and its
        /// end offset is one past the index of its last character in the text.
        /// Whitespace is skipped; if whitespace occurs inside a dictionary match
        /// the match continues across it, so the offsets may span more
        /// characters than the token text contains.
        /// </remarks>
        /// <returns>The next token, or null at the end of the text.</returns>
        public override Token Next()
        {
            if (tree == null)
            {
                tree = new WordTree();
                tree.LoadDict();
            }

            // Current node of the dictionary tree; restarts at the root for every token.
            Hashtable node = WordTree.chartable;

            string word = "";
            int pos = offset;
            int tokenStart = pos; // index of the first character of the token
            int tokenEnd = pos;   // one past the last character of the token

            while (pos < dataLen)
            {
                string ch = text.Substring(pos, 1);

                // Skip whitespace.
                if (string.IsNullOrEmpty(ch.Trim()))
                {
                    pos++;
                    continue;
                }

                // A null node means the previous character had no subtree, so
                // nothing can extend the current match.
                if (node == null || !node.Contains(ch))
                {
                    if (word.Length == 0)
                    {
                        // Character is not in the dictionary: emit it alone, or
                        // the whole run when it starts an English word (type 1)
                        // or a number (type 2).
                        int end = pos + 1;
                        int charType = tree.GetCharType(ch);
                        if (charType == 1 || charType == 2)
                        {
                            while (end < dataLen && tree.GetCharType(text.Substring(end, 1)) == charType)
                            {
                                end++;
                            }
                        }

                        // The length is measured from pos, not from offset:
                        // pos may have moved past skipped whitespace.
                        word = text.Substring(pos, end - pos);
                        tokenStart = pos;
                        tokenEnd = end;
                        offset = end;
                    }
                    else
                    {
                        // The match ends here; this character starts the next token.
                        offset = pos;
                    }

                    return new Token(word, tokenStart, tokenEnd);
                }

                // Character continues a dictionary entry.
                if (word.Length == 0)
                {
                    tokenStart = pos;
                }

                word += ch;
                node = (Hashtable)node[ch];
                pos++;
                tokenEnd = pos;
            }

            // End of text: flush a pending match (also when only whitespace followed it).
            offset = dataLen;
            if (word.Length > 0)
            {
                return new Token(word, tokenStart, tokenEnd);
            }

            return null;
        }
    }
}
测试的代码:
如果您非常迫切的想了解IT领域最新产品与技术信息,那么订阅至顶网技术邮件将是您的最佳途径之一。
现场直击|2021世界人工智能大会
直击5G创新地带,就在2021MWC上海
5G已至 转型当时——服务提供商如何把握转型的绝佳时机
寻找自己的Flag
华为开发者大会2020(Cloud)- 科技行者