首页 技术 正文
技术 2022年11月15日
0 收藏 943 点赞 3,763 浏览 11930 个字

之前公司要做一个信息展示的网站,领导说要用lucene.net来实现全文检索,类似百度的搜索功能,但是本人技术有限,只是基本实现搜索和高亮功能,特此记录;

先看下页面效果,首先我搜索“为什么APP消息没有推送”,出来的结果如下图:

记录lucene.net的使用过程

然后我再搜索“醒 消息 推”,出来结果如下图:

记录lucene.net的使用过程

然后说下,我使用的是Lucene.net版本是2.9.22,盘古分词的版本是2.3.1,注意,版本lucene.net和盘古分词的版本一定要对上,之前我用Lucene.net3.0的版本,就一直有错误,后来换到低版本才没问题的;

接着是关键的类LuceneHelper,如下所示:

 public class LuceneHelper
{
readonly LogHelper _logHelper = new LogHelper(MethodBase.GetCurrentMethod());
private LuceneHelper() { } #region 单例
private static LuceneHelper _instance = null;
private static readonly object Lock = new object();
/// <summary>
/// 单例
/// </summary>
public static LuceneHelper instance
{
get
{
lock (Lock)
{
if (_instance == null)
{
_instance = new LuceneHelper();
PanGu.Segment.Init(PanGuXmlPath);//使用盘古分词,一定要记得初始化
}
return _instance;
}
}
}
#endregion #region 分词测试 /// <summary>
/// 处理关键字为索引格式
/// </summary>
/// <param name="keywords"></param>
/// <returns></returns>
private string GetKeyWordsSplitBySpace(string keywords)
{
PanGuTokenizer ktTokenizer = new PanGuTokenizer();//使用盘古分词器来吧关键字分词
StringBuilder result = new StringBuilder();
ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keywords);
foreach (WordInfo word in words)
{
if (word == null)
{
continue;
}
//result.AppendFormat("{0}^{1}.0 ", word.Word, (int)Math.Pow(3, word.Rank));
result.AppendFormat("{0} ", word.Word);
}
return result.ToString().Trim();
}
#endregion #region 创建索引
/// <summary>
/// 创建索引
/// </summary>
/// <param name="datalist"></param>
/// <returns></returns>
public bool CreateIndex<T>(IList<T> datalist)
{
IndexWriter writer = null;
try
{
writer = new IndexWriter(directory_luce, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);//false表示追加(true表示删除之前的重新写入)
//writer = new IndexWriter(directory_luce, null, true, IndexWriter.MaxFieldLength.LIMITED);//false表示追加(true表示删除之前的重新写入)
}
catch
{
writer = new IndexWriter(directory_luce, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);//false表示追加(true表示删除之前的重新写入)
//writer = new IndexWriter(directory_luce, null, true, IndexWriter.MaxFieldLength.LIMITED);//false表示追加(true表示删除之前的重新写入)
}
foreach (var data in datalist)
{
CreateIndex<T>(writer, data);
}
writer.Optimize();
writer.Close();
return true;
} public bool CreateIndex<T>(IndexWriter writer, T data)
{
try
{ if (data == null) return false;
Document doc = new Document();
Type type = data.GetType(); //创建类的实例
//object obj = Activator.CreateInstance(type, true);
//获取公共属性
PropertyInfo[] Propertys = type.GetProperties();
for (int i = ; i < Propertys.Length; i++)
{
//Propertys[i].SetValue(Propertys[i], i, null); //设置值
PropertyInfo pi = Propertys[i];
string name = pi.Name;
object objval = pi.GetValue(data, null);
string value = objval == null ? "" : objval.ToString(); //值
if (name.ToLower() == "id" || name.ToLower() == "type")//id在写入索引时必是不分词,否则是模糊搜索和删除,会出现混乱
{
doc.Add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED));//id不分词
}
else if (name.ToLower() == "IsNewest".ToLower())
{
//doc.Add(new Field(name, value, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS));//分词建索引,但是Field的值不像通常那样被保存,而是只取一个byte,这样节约存储空间
doc.Add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED));//IsNewest不分词
}
else if (name.ToLower() == "IsReqular".ToLower())
{
//doc.Add(new Field(name, value, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS));//分词建索引,但是Field的值不像通常那样被保存,而是只取一个byte,这样节约存储空间
doc.Add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED));//IsReqular不分词
}
else
{
if (name.ToLower() == "Contents".ToLower())
{
value = GetNoHtml(value);//去除正文的html标签
}
doc.Add(new Field(name, value, Field.Store.YES, Field.Index.ANALYZED));//其他字段分词
}
}
writer.AddDocument(doc);
}
catch (System.IO.FileNotFoundException fnfe)
{
throw fnfe;
}
return true;
}
#endregion #region 在title和content字段中查询数据,该方法未使用,可能有错漏,我使用的是下面的分页查询的;
/// <summary>
/// 在title和content字段中查询数据
/// </summary>
/// <param name="keyword"></param>
/// <returns></returns>
public List<Questions> Search(string keyword)
{ string[] fileds = { "Title", "Contents" };//查询字段
//Stopwatch st = new Stopwatch();
//st.Start();
QueryParser parser = null;// new QueryParser(Lucene.Net.Util.Version.LUCENE_30, field, analyzer);//一个字段查询
parser = new MultiFieldQueryParser(version, fileds, analyzer);//多个字段查询
Query query = parser.Parse(keyword);
int n = ;
IndexSearcher searcher = new IndexSearcher(directory_luce, true);//true-表示只读
TopDocs docs = searcher.Search(query, (Filter)null, n);
if (docs == null || docs.totalHits == )
{
return null;
}
else
{
List<Questions> list = new List<Questions>();
int counter = ;
foreach (ScoreDoc sd in docs.scoreDocs)//遍历搜索到的结果
{
try
{
Document doc = searcher.Doc(sd.doc); string id = doc.Get("ID");
string title = doc.Get("Title");
string content = doc.Get("Contents"); string createdate = doc.Get("AddTime");
PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter = new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
PanGu.HighLight.Highlighter highlighter = new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new PanGu.Segment());
highlighter.FragmentSize = Int32.MaxValue;
content = highlighter.GetBestFragment(keyword, content);
string titlehighlight = highlighter.GetBestFragment(keyword, title);
if (titlehighlight != "") title = titlehighlight; Questions model = new Questions
{
ID = int.Parse(id),
Title = title,
Contents = content,
AddTime = DateTime.Parse(createdate)
}; list.Add(model);
}
catch (Exception ex)
{
Console.WriteLine(ex.Message);
}
counter++;
}
return list;
}
//st.Stop();
//Response.Write("查询时间:" + st.ElapsedMilliseconds + " 毫秒<br/>"); }
#endregion #region 在不同的分类下再根据title和content字段中查询数据(分页)
/// <summary>
/// 在不同的类型下再根据title和content字段中查询数据(分页)
/// </summary>
/// <param name="_type">分类,传空值查询全部</param>
/// <param name="keyword"></param>
/// <param name="PageIndex"></param>
/// <param name="PageSize"></param>
/// <param name="TotalCount"></param>
/// <returns></returns>
public List<Questions> Search(string _type,bool? _isnew,bool? _isreq ,string keyword, int PageIndex, int PageSize, out int TotalCount)
{
try
{
if (PageIndex < ) PageIndex = ;
//Stopwatch st = new Stopwatch();
//st.Start();
BooleanQuery bq = new BooleanQuery();
if (_type != "" && _type != "-100")
{
QueryParser qpflag = new QueryParser(version, "Type", analyzer);//一个字段查询
Query qflag = qpflag.Parse(_type);
bq.Add(qflag, Lucene.Net.Search.BooleanClause.Occur.MUST);//与运算
}
if (_isnew.HasValue)
{
QueryParser qpnew = new QueryParser(version, "IsNewest", analyzer);
Query qnew = qpnew.Parse(_isnew.Value.ToString());
bq.Add(qnew, Lucene.Net.Search.BooleanClause.Occur.MUST);
}
if (_isreq.HasValue)
{
QueryParser qpreq = new QueryParser(version, "IsReqular", analyzer);
Query qreq = qpreq.Parse(_isnew.Value.ToString());
bq.Add(qreq, Lucene.Net.Search.BooleanClause.Occur.MUST);
} string keyword2 = keyword;
if (keyword != "")
{ keyword = GetKeyWordsSplitBySpace(keyword); string[] fileds = { "Title", "Contents" };//查询字段
QueryParser parser = null;// new QueryParser(version, field, analyzer);//一个字段查询
parser = new MultiFieldQueryParser(version, fileds, analyzer);//多个字段查询
//parser.DefaultOperator = QueryParser.Operator.OR;
parser.SetDefaultOperator(QueryParser.Operator.OR);//这里QueryParser.Operator.OR表示并行结果,相当于模糊搜索,QueryParser.Operator.AND相当于精准搜索
Query queryKeyword = parser.Parse(keyword); bq.Add(queryKeyword, Lucene.Net.Search.BooleanClause.Occur.MUST);//与运算
} //TopScoreDocCollector collector = TopScoreDocCollector.Create(PageIndex * PageSize, false);
IndexSearcher searcher = new IndexSearcher(directory_luce, true);//true-表示只读 //Sort sort = new Sort(new SortField("AddTime", SortField.DOC, false)); //此处为结果排序功能,但是使用排序会影响搜索权重(类似百度搜索排名机制)
//TopDocs topDocs = searcher.Search(bq, null, PageIndex * PageSize, sort);
TopDocs topDocs = searcher.Search(bq, null, PageIndex * PageSize);
//searcher.Search(bq, collector);
if (topDocs == null || topDocs.totalHits == )
{
TotalCount = ;
return null;
}
else
{
int start = PageSize * (PageIndex - );
//结束数
int limit = PageSize;
ScoreDoc[] hits = topDocs.scoreDocs;
List<Questions> list = new List<Questions>();
int counter = ;
TotalCount = topDocs.totalHits;//获取Lucene索引里的记录总数 //Lucene.Net.Highlight.SimpleHTMLFormatter simpleHTMLFormatter = new Lucene.Net.Highlight.SimpleHTMLFormatter("<em class=\"hl-l-t-main\">", "</em>");
//Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(simpleHTMLFormatter,new Lucene.Net.Highlight.QueryScorer(bq)); foreach (ScoreDoc sd in hits)//遍历搜索到的结果
{
try
{
Document doc = searcher.Doc(sd.doc);
string id = doc.Get("ID");
string title = doc.Get("Title");
string content = doc.Get("Contents");
string updatetime = doc.Get("AddTime"); PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter = new PanGu.HighLight.SimpleHTMLFormatter("<em class=\"hl-l-t-main\">", "</em>");
PanGu.HighLight.Highlighter highlighter = new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new Segment());//搜索关键字高亮显示,上面的高亮样式自己写
highlighter.FragmentSize = Int32.MaxValue; //这里如果值小于搜索内容的长度的话,会导致搜索结果被截断,因此设置最大,根据需求来吧
string contentHighlight = highlighter.GetBestFragment(keyword2, content);
string titleHighlight = highlighter.GetBestFragment(keyword2, title); //string titleHighlight = highlighter.GetBestFragment(analyzer, "Title", title); //string contentHighlight = highlighter.GetBestFragment(analyzer, "Contents", content); title = string.IsNullOrEmpty(titleHighlight) ? title : titleHighlight;
content = string.IsNullOrEmpty(contentHighlight) ? content : contentHighlight; var model = new Questions
{
ID = int.Parse(id),
Title = title,
Contents = content,
AddTime = DateTime.Parse(updatetime)
};
list.Add(model);
}
catch (Exception ex)
{
//这里可以写错误日志
}
counter++;
}
return list;
}
//st.Stop();
}
catch (Exception e)
{
TotalCount = ;
return null;
} } /// <summary>
/// 去除html标签
/// </summary>
/// <param name="StrHtml"></param>
/// <returns></returns>
public string GetNoHtml(string StrHtml)
{
string strText="";
if (!string.IsNullOrEmpty(StrHtml))
{
strText = System.Text.RegularExpressions.Regex.Replace(StrHtml, @"<[^>]+>", "");
strText = System.Text.RegularExpressions.Regex.Replace(strText, @"&[^;]+;", "");
strText = System.Text.RegularExpressions.Regex.Replace(strText, @"\\s*|\t|\r|\n", ""); }
return strText; }
#endregion #region 删除索引数据(根据id)
/// <summary>
/// 删除索引数据(根据id)
/// </summary>
/// <param name="id"></param>
/// <returns></returns>
public bool Delete(string id)
{
bool IsSuccess = false;
Term term = new Term("id", id);
//Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
//Version version = new Version();
//MultiFieldQueryParser parser = new MultiFieldQueryParser(version, new string[] { "name", "job" }, analyzer);//多个字段查询
//Query query = parser.Parse("小王"); //IndexReader reader = IndexReader.Open(directory_luce, false);
//reader.DeleteDocuments(term);
//Response.Write("删除记录结果: " + reader.HasDeletions + "<br/>");
//reader.Dispose(); IndexWriter writer = new IndexWriter(directory_luce, analyzer, false, IndexWriter.MaxFieldLength.LIMITED);
writer.DeleteDocuments(term); // writer.DeleteDocuments(term)或者writer.DeleteDocuments(query);
////writer.DeleteAll();
writer.Commit();
//writer.Optimize();//
IsSuccess = writer.HasDeletions();
writer.Close();
return IsSuccess;
}
#endregion #region 删除全部索引数据
/// <summary>
/// 删除全部索引数据
/// </summary>
/// <returns></returns>
public bool DeleteAll()
{
bool IsSuccess = true;
try
{
IndexWriter writer = new IndexWriter(directory_luce, analyzer, false, IndexWriter.MaxFieldLength.LIMITED);
writer.DeleteAll();
writer.Commit();
writer.Optimize();//
IsSuccess = writer.HasDeletions();
writer.Close();
}
catch
{
IsSuccess = false;
}
return IsSuccess;
}
#endregion #region directory_luce
private Lucene.Net.Store.Directory _directory_luce = null;
/// <summary>
/// Lucene.Net的目录-参数
/// </summary>
public Lucene.Net.Store.Directory directory_luce
{
get
{
if (_directory_luce == null) _directory_luce = Lucene.Net.Store.FSDirectory.Open(directory);
return _directory_luce;
}
}
#endregion #region directory
private System.IO.DirectoryInfo _directory = null;
/// <summary>
/// 索引在硬盘上的目录
/// </summary>
public System.IO.DirectoryInfo directory
{
get
{
if (_directory == null)
{
string dirPath = HttpContext.Current.Server.MapPath("/LuceneDic");
if (System.IO.Directory.Exists(dirPath) == false)
_directory = System.IO.Directory.CreateDirectory(dirPath);
else
_directory = new System.IO.DirectoryInfo(dirPath);
}
return _directory;
}
}
#endregion #region analyzer
private Analyzer _analyzer = null;
/// <summary>
/// 分析器
/// </summary>
public Analyzer analyzer
{
get
{
//if (_analyzer == null)
{
// _analyzer = new Lucene.Net.Analysis.PanGu.PanGuAnalyzer();//弃用盘古分词,感觉有点问题,测试下来没有自带分词好用,也有可能是好用的,但是之前用的高版本lucene.net,导致分词失效
_analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
}
return _analyzer;
}
}
#endregion #region version
private static Lucene.Net.Util.Version _version = Lucene.Net.Util.Version.LUCENE_29;
/// <summary>
/// 版本号枚举类
/// </summary>
public Lucene.Net.Util.Version version
{
get
{
return _version;
}
}
#endregion
/// <summary>
/// 盘古分词的配置文件
/// </summary>
protected static string PanGuXmlPath
{
get
{
return HttpContext.Current.Server.MapPath("/PanGu/PanGu.xml");
}
}
}

 然后是一些需要引用的DLL和盘古分词的字典文件等,如下所示:

lucene.net和盘古分词DLL和文件等.rar

至此Lucene.net的简单应用到此结束,谢谢!

相关推荐
python开发_常用的python模块及安装方法
adodb:我们领导推荐的数据库连接组件bsddb3:BerkeleyDB的连接组件Cheetah-1.0:我比较喜欢这个版本的cheeta…
日期:2022-11-24 点赞:878 阅读:9,499
Educational Codeforces Round 11 C. Hard Process 二分
C. Hard Process题目连接:http://www.codeforces.com/contest/660/problem/CDes…
日期:2022-11-24 点赞:807 阅读:5,912
下载Ubuntn 17.04 内核源代码
zengkefu@server1:/usr/src$ uname -aLinux server1 4.10.0-19-generic #21…
日期:2022-11-24 点赞:569 阅读:6,746
可用Active Desktop Calendar V7.86 注册码序列号
可用Active Desktop Calendar V7.86 注册码序列号Name: www.greendown.cn Code: &nb…
日期:2022-11-24 点赞:733 阅读:6,503
Android调用系统相机、自定义相机、处理大图片
Android调用系统相机和自定义相机实例本博文主要是介绍了android上使用相机进行拍照并显示的两种方式,并且由于涉及到要把拍到的照片显…
日期:2022-11-24 点赞:512 阅读:8,142
Struts的使用
一、Struts2的获取  Struts的官方网站为:http://struts.apache.org/  下载完Struts2的jar包,…
日期:2022-11-24 点赞:671 阅读:5,305