|
1、中文分词(smartcn)
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>

2、高亮显示
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>

源码:
(1)
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Indexer {
% [2 f* T9 E' N/ o; r' p% H$ [4 E1 c& e9 E+ d3 I
private Integer ids[]={1,2,3}; y$ f% U8 K+ y: H( p# I- ^
private String citys[]={"青岛","南京","上海"};
; _6 W0 n7 C6 f* u' i' B# { private String descs[]={
+ S8 t/ {; ^, u+ X "青岛是一个美丽的城市。",
) s% U/ V# u0 J' f5 W" `5 _* u( u "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。",4 f$ q8 j. s/ `, F- p
"上海是一个繁华的城市。"6 M4 W1 T5 p2 U8 }; m
};& o3 f _3 i& s; w/ q
2 z* o9 L! u7 Q; s- `: S' {
private Directory dir;& C7 p) j2 x S' [: ~: @3 h' W
: P2 p- b. v g# T: _ /**# e7 [8 R1 L$ n
* 获取IndexWriter实例
l' h6 u# q# k, U/ y * @return
6 D7 v' V4 f8 d& n+ ?$ E) V * @throws Exception
* {+ m$ y4 q2 O */
0 D( `+ K) B G2 Q private IndexWriter getWriter()throws Exception{) @" R) d9 L. ]) Q7 ~. |
//Analyzer analyzer=new StandardAnalyzer(); // 标准分词器( U. r& Z+ b; X7 D$ ?
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();# R' Z$ w$ \$ w* E& K3 i7 h
IndexWriterConfig iwc=new IndexWriterConfig(analyzer);+ u$ _9 W2 @4 t5 p/ t, O9 S1 _, t
IndexWriter writer=new IndexWriter(dir, iwc);
3 Y# o" a7 h8 b- a+ o, x return writer;
2 o$ W" y, V7 N' I }
) u4 `% b! E8 Z0 K& K0 g8 V
9 ?9 r& @, a% k, e4 Q; ~& n /**
2 f/ Z, ^: L8 i/ n * 生成索引, w/ @$ k% @: o; v: d" h
* @param indexDir0 w1 V0 ?2 `" Q {7 {6 j ^
* @throws Exception
$ W G! S6 Z F0 F) G */
7 c; Q( F- V" e private void index(String indexDir)throws Exception{
' e- M7 ^8 Q9 @1 {; A5 y dir=FSDirectory.open(Paths.get(indexDir));
; L, \" U4 N# I2 S& f3 r7 {& ? IndexWriter writer=getWriter();
) \3 L. g6 y' g; i5 d" c for(int i=0;i<ids.length;i++){
+ L+ M" v& o, K8 O' S; c Document doc=new Document();
$ V, [9 t9 \4 I) R% E doc.add(new IntField("id", ids, Field.Store.YES));
2 c; R; J; i0 D: t/ c1 j6 O doc.add(new StringField("city",citys,Field.Store.YES));9 [2 e( H" m: F m% m/ g9 ^ [
doc.add(new TextField("desc", descs, Field.Store.YES));
& `5 Y A* M/ p/ ` writer.addDocument(doc); // 添加文档
& P9 @: h* c* \: I }' h0 J: W- x' |1 o! N9 a. _! f( V' g
writer.close();
% G: S& T9 J7 W) o& K0 Z }! F7 n. u4 v* \6 ?! t
5 c% L' C2 g7 ~' H9 r+ H, ~6 K
& ^+ l) I& o0 u' T: B3 G public static void main(String[] args) throws Exception {
]- Q! @5 N' m) ]5 L) f9 p' K% d new Indexer().index("D:\\lucene6");
+ [8 d3 f% ~+ t* v4 x! D$ R }
9 l, g' m( [) [% c, ^* t* T$ E5 X$ P L- ?; B# c
}

(2)
import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

' c! X( B E* c7 B" W0 E+ Y" |, Epublic class Searcher {0 O. |. l. I& }3 V2 x- a( L
/ f1 W/ Z" N% }3 p# e$ M2 Y public static void search(String indexDir,String q)throws Exception{
6 i5 d J. o8 g Directory dir=FSDirectory.open(Paths.get(indexDir));
& D D2 B' I% R; J0 Q7 N) C8 S IndexReader reader=DirectoryReader.open(dir);$ U4 x4 ?; n" k: j8 M
IndexSearcher is=new IndexSearcher(reader);
% x+ ]7 N. F% R+ ^2 B* }! I // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器6 y4 ] ]* F7 h: a
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
4 l8 u! |6 a& P5 ^9 Q. U QueryParser parser=new QueryParser("desc", analyzer);
; s! E& |% Z- w4 d Query query=parser.parse(q);( w4 t9 L+ |% F: T
long start=System.currentTimeMillis();2 F S3 u0 r$ _' w( F# y+ ?
TopDocs hits=is.search(query, 10);
" Q9 q! e x$ Y: x/ C" c, T Z _. C long end=System.currentTimeMillis();
& j2 Z, [' z( S( O, C0 f System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");4 N& L; i! @. U, l T5 s3 |; I/ b
( j$ \& E3 l) R" i( V% j QueryScorer scorer=new QueryScorer(query);( P5 _; D+ C4 W, Y
Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);: S& p9 d2 p% m3 T
SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");; r( E6 n" z# C8 K; M
Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);8 `( q/ p' ^/ \% [( y* z4 M. m
highlighter.setTextFragmenter(fragmenter);
! r9 j- Y$ }, S- B& M+ ?0 r% v for(ScoreDoc scoreDoc:hits.scoreDocs){
. q, K1 S" X( k8 D( `1 f H Document doc=is.doc(scoreDoc.doc);4 z' e) X6 i+ c% V8 N$ i- |
System.out.println(doc.get("city"));
9 y" Y5 _ R B1 B System.out.println(doc.get("desc"));0 v' W. H# X2 z, q! W0 W
String desc=doc.get("desc");6 ^1 C) K- j: g K; @! l* M
if(desc!=null){
$ T4 U( j, C7 v TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));" W: @5 m) B4 \- @$ k6 O
System.out.println(highlighter.getBestFragment(tokenStream, desc));
9 S% ~3 S- m/ G& N8 t }6 u, r$ B j/ Z1 g
}
- }4 ?9 V3 r( y6 V reader.close();
2 ]* b6 _. C( q }5 c, `& j: |; k8 Y( l
9 D( S" F7 q- v/ Z public static void main(String[] args) {, O5 p5 y0 P7 P% L% D9 k
String indexDir="D:\\lucene6";
2 v, f5 w) M) P1 A& V# u# V X String q="南京文明";
+ ?9 W# p& q3 C: f- o6 \( x try {
6 t9 u2 E( c1 t) \$ d9 ? search(indexDir,q);2 n9 E% `( z! x; o! U2 |( c
} catch (Exception e) {
* E$ }9 c9 { J z. P3 z, U$ A' O // TODO Auto-generated catch block$ H$ _3 Z5 ]& U5 h
e.printStackTrace();' x% j; s/ m z3 f9 ?# K9 I
}
( D- P5 ?% Y- p) q, @1 r }
) [0 ]' d. e& y. w3 [/ M Q8 N- O p}
! Z" n+ |7 m' d& m- b9 f5 K1 v& v
' u1 u; \$ j! z: w. W% M
`6 A7 d; ^. ?" q
/ E- e; n: V T$ J- f& U$ H; t9 q$ y8 I7 T8 I& i7 k( X
: C& e1 A+ j. ~; z6 x |
|