|
1、中文分词(smartcn)
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>

2、高亮显示
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>
源码:
(1)
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Indexer {+ u+ ?( c; R4 e4 b
5 q8 }& W9 e5 a4 ~; T$ Y
private Integer ids[]={1,2,3};4 z3 ?0 E( Q3 B$ ?+ N- A
private String citys[]={"青岛","南京","上海"};8 C9 V" }" ?# I' f0 R# q
private String descs[]={9 ^0 }6 R* s- I! @. o' C/ X
"青岛是一个美丽的城市。",
5 U9 ^* m: k9 I+ @" d" d/ d "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。",
' |6 F5 ^" C, D0 d3 z+ @+ U* [, G "上海是一个繁华的城市。"
9 w" J5 v$ Z; M0 Y# _% h };. G' o2 t& f: y3 O$ i. ^
9 f0 G4 g, S6 B F9 Q2 M1 P% t private Directory dir;
$ M6 w1 _$ Z) e* g; w5 D
- E/ K- W" i4 P /**
M% ^$ M9 n: [7 f/ F1 j * 获取IndexWriter实例# \/ p" p. m0 O7 R8 x5 p3 M
* @return
3 d! {, W! V O5 j g/ v' o0 { * @throws Exception/ d1 J. M+ w U% a1 o0 s6 c
*/
. U/ ?: p' |8 V$ i( k private IndexWriter getWriter()throws Exception{
' s! X# H0 j+ a! ^) J! }& \ //Analyzer analyzer=new StandardAnalyzer(); // 标准分词器' ?/ o& d5 @' g( s' p6 X* i
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();0 _, p! f. l- h! c
IndexWriterConfig iwc=new IndexWriterConfig(analyzer);( y/ V% o, b0 P& [
IndexWriter writer=new IndexWriter(dir, iwc);; L u) c5 [" [( p
return writer;# P9 U' C" ]* F
}
6 f& C" x$ s8 N4 c2 z! V( I, @6 ^4 ]0 t' c. I3 a- ]
/**
" S6 X4 |' N2 t$ @6 f6 \ * 生成索引
2 V. j; q Y3 ? * @param indexDir
$ V- `+ j- h2 n8 Z * @throws Exception
: u( ?5 R; U, _( g9 m9 O */
. X2 b6 ?3 H: S2 Z private void index(String indexDir)throws Exception{
2 m9 }3 E1 d! z3 M$ a& ~ dir=FSDirectory.open(Paths.get(indexDir));
4 a1 i+ w3 S/ h IndexWriter writer=getWriter();8 y9 [- J0 \5 |2 y
for(int i=0;i<ids.length;i++){7 _4 e& }. N# ~: g" Z
Document doc=new Document();
0 [; |. {8 a- D( W& h/ R doc.add(new IntField("id", ids, Field.Store.YES));
* H v0 o6 }, }# ~# M4 e6 K doc.add(new StringField("city",citys,Field.Store.YES));/ I5 I) M/ `- U, i4 y5 _
doc.add(new TextField("desc", descs, Field.Store.YES));6 N' s( T3 H" d0 d4 @0 Y8 X
writer.addDocument(doc); // 添加文档3 m6 i) Z9 D* P' c7 P* M3 r, L
}
. x5 r% _' S A* Q+ z! h5 K, ~ writer.close();# B, K! n- e! |' Q
}
/ j* g% b7 M2 ?1 e- ]: Z; c( z# f$ h# \8 h
% k+ Q( M) M3 o/ `& } public static void main(String[] args) throws Exception {9 j K7 C# I5 Q
new Indexer().index("D:\\lucene6");
! m% A% J( B$ x( O, x7 C }: P5 }6 x% b' C) V( Q0 Y
; ]' N3 A$ r K* A, W}

(2)
import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
; Q ]2 |7 O5 y7 h: xpublic class Searcher {' n4 A8 V3 F3 z' u
6 |- x; X. w: h4 u
public static void search(String indexDir,String q)throws Exception{
7 p! \5 j& o* c3 k Directory dir=FSDirectory.open(Paths.get(indexDir));) j1 c1 `# ?( F: r R u6 r
IndexReader reader=DirectoryReader.open(dir);* H: X; m) Y& g* R4 g* {& _: b9 R
IndexSearcher is=new IndexSearcher(reader);
9 u/ k8 M5 f9 ~, q1 `: _4 {1 W // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器( X, M7 U6 n8 g: S
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();* J3 j7 [2 R+ m& I, x7 h! i; Y
QueryParser parser=new QueryParser("desc", analyzer);, E, s5 V! w0 ^- R: G$ l# \
Query query=parser.parse(q);
! \- a/ Y8 a* V7 v long start=System.currentTimeMillis();
+ Y- l( ^% O6 e7 B1 l/ W3 m TopDocs hits=is.search(query, 10);$ z+ h a. [ m; i3 M" P
long end=System.currentTimeMillis();
/ s: K. F$ S$ r% m System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");
& ~, i7 }5 \5 t7 ^
% Z2 U6 [7 e3 q# t0 X, v* \) \ QueryScorer scorer=new QueryScorer(query);" [; ^8 o& o) @, x5 y5 k
Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);
4 o7 @8 P' U# G t- u% H3 y SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");
2 g3 }: }# [! | Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);
# T5 |) a! c3 G' N. ~+ m/ U. k highlighter.setTextFragmenter(fragmenter);0 {4 C: T5 X; z
for(ScoreDoc scoreDoc:hits.scoreDocs){
: n: v- c# N8 q/ N2 h- a% w4 x! p Document doc=is.doc(scoreDoc.doc);! G q7 `7 `0 x; O, h5 F- [2 ?
System.out.println(doc.get("city"));! F# o- V' \! @* w4 n' V
System.out.println(doc.get("desc"));& w# u3 d2 N& ~- Z, H4 M
String desc=doc.get("desc");
1 U2 \% @6 [2 [" s4 G if(desc!=null){
" s1 [; Z1 t d1 j: k% F TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));: A* t! ~- a! k4 Y! K" R8 k
System.out.println(highlighter.getBestFragment(tokenStream, desc));' i$ p7 A" E8 Y/ D- j
}" m0 Z! k* ?2 q0 m! z
}
5 m& S5 i4 d# W; ?; ~% A reader.close();
i g h: B& S }/ b8 [7 {) k; L6 o
9 c# i( I3 a! D. p m P public static void main(String[] args) {. F2 x& h6 ^5 H; F2 W
String indexDir="D:\\lucene6";; P c; L0 d' [/ @$ q
String q="南京文明";
7 J6 P; k- p0 S# F3 J% E! p try {- |( R- a9 j1 k
search(indexDir,q);
. O P" E- i8 }* w0 `9 E } catch (Exception e) {& w- q0 a2 O4 K
// TODO Auto-generated catch block2 {' v4 t: C5 Z1 b* M* `5 {3 L4 O2 A
e.printStackTrace();
) E7 @) k! [ t4 b }4 Y5 N+ ?1 C B R0 l
}
7 i. d2 k7 ]5 K; r w- X/ d}0 a, \# _. H9 W4 s! C2 K7 v
7 b# }; s" _% `0 @/ L8 c
F- R% a2 b* l. N8 R% P, p* |, ^
& X0 v5 m' e' y$ K
0 ~6 I7 b1 ]$ X! k, S9 R0 w5 m; ^ @2 p0 y6 H+ ~5 ]
|
|