|
1、中文分词(smartcn)
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>

2、高亮显示
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>

源码:
(1)
import java.nio.file.Paths;! ~, I9 N8 b" U/ c [; A. i- m
6 ^$ d' p7 w8 b6 ?8 L
import org.apache.lucene.analysis.Analyzer;# S1 X) H1 G* q3 _ n7 `
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;0 K& B5 i9 k' |- D( M& S
import org.apache.lucene.analysis.standard.StandardAnalyzer;5 T$ n& P' {' o* [6 ~, o
import org.apache.lucene.document.Document;
" S' y! L( X% s. x9 P& m4 k) \# himport org.apache.lucene.document.Field;0 s( ?+ E9 i0 B& w1 [8 Z, v2 T
import org.apache.lucene.document.IntField;
1 ?7 s9 t" X* C0 n" Dimport org.apache.lucene.document.StringField;1 G4 |" b: W9 @9 T
import org.apache.lucene.document.TextField;0 A2 i& J \+ k2 `
import org.apache.lucene.index.IndexWriter;
# N4 o4 B' e7 F' C$ s7 ^1 Nimport org.apache.lucene.index.IndexWriterConfig;( X6 g) d8 ^ d* m s; y8 s
import org.apache.lucene.store.Directory;2 ]8 P1 F$ Q. T9 I- |$ |0 L1 H9 Q
import org.apache.lucene.store.FSDirectory;& q% q. @4 b5 W$ t* S) s# K+ o
# _* k0 G& R! E, Z/ V8 `
public class Indexer {) c8 |, e& R/ |0 [$ d
9 ^& k5 a& ^9 O) b' a) m private Integer ids[]={1,2,3};/ g- P( S0 Y0 t# V9 ], ]; N0 I
private String citys[]={"青岛","南京","上海"};
( d; O& i. K# ^ private String descs[]={
( e& ~! Z; I1 z5 h% z/ d* v- d# w "青岛是一个美丽的城市。",
9 E7 I9 K4 ~* t, T, ~4 L0 F1 R$ K "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。",: C" u- t4 k2 ?( H
"上海是一个繁华的城市。"6 e+ n* q I" R
};2 F7 l& _- r( v2 \/ i& s/ B/ D; A9 H( R
+ X7 l0 y0 z0 Q, _
private Directory dir;9 K! \/ i7 z1 C0 k
# L- h; _/ I( I# `/ i6 t6 Y
/**8 z8 ]$ z" `+ h# |4 q7 _+ N
* 获取IndexWriter实例) e% ^+ s. o0 C. t$ R
* @return1 }# ~6 ]+ m6 S( D
* @throws Exception
% _1 }: D1 R7 d( X+ a# G */
* T$ u: R9 `- b8 B) p x7 m& [ private IndexWriter getWriter()throws Exception{
; x. a F; B, G! T //Analyzer analyzer=new StandardAnalyzer(); // 标准分词器$ x# Y' p$ F% c! w0 y
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();2 {6 ]/ d% S7 a" s
IndexWriterConfig iwc=new IndexWriterConfig(analyzer);7 y: A( @) e$ Q+ p
IndexWriter writer=new IndexWriter(dir, iwc);
' J5 s8 |5 u9 m% c: u3 ` return writer;
7 K9 u( A/ N2 `6 i6 X }& Q S: Q6 [* l" P5 ?2 H% h
4 ?7 D. k5 |+ E K" { /**
1 }: f' M4 [/ v8 g * 生成索引
, D. v3 }# r% X2 T% w2 I0 Y * @param indexDir5 P# r V1 J2 g& G- F
* @throws Exception
. o d/ @- l7 t2 N: J: t! o */
; x% J) @' G. ? private void index(String indexDir)throws Exception{
0 V8 m0 T1 ?, i& n1 j G dir=FSDirectory.open(Paths.get(indexDir));$ J. J$ J9 R3 P. o8 k
IndexWriter writer=getWriter();6 k8 O& c( S' X k# O- _
for(int i=0;i<ids.length;i++){
: {8 A, A! Q; I5 S. W4 m( s7 o0 L Document doc=new Document();
# f+ C+ Z- o0 { doc.add(new IntField("id", ids, Field.Store.YES));
v% G5 V$ Z! r* Q% Q( O4 S doc.add(new StringField("city",citys,Field.Store.YES));
V3 a" q* J6 `% M% X7 M: o doc.add(new TextField("desc", descs, Field.Store.YES));
( v, ~. i+ s" E* _ writer.addDocument(doc); // 添加文档
5 k4 P l/ ]1 l% E+ H/ N9 z }
- ]1 Y G0 Y. z writer.close();
@" [* @% N6 D3 Q! P4 H" I# m }
1 d. H( h' P1 ?/ T5 v3 i+ I! U0 u8 \' z( V9 x* S
' t8 F$ n8 ~' W9 @ @9 x1 Z Q4 R public static void main(String[] args) throws Exception {0 F8 m1 i: W! |, @; G( B
new Indexer().index("D:\\lucene6");" V, S7 t z: G Y* `8 S
}
0 W. k# e* X" j! _2 _% D% r) b. l4 G* m" Z/ u. `6 m$ D( T0 W
}


(2)
import java.io.StringReader;3 K% r) S" @# a, ]
import java.nio.file.Paths;! N; R+ s& L. V
) `6 w6 S7 X, `7 ]" H2 ?1 \4 R0 f) Kimport org.apache.lucene.analysis.Analyzer;
8 I' {, T. H9 R7 y# Z9 gimport org.apache.lucene.analysis.TokenStream;/ P2 v& u T. f
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
: ]6 r/ u* h) f) O5 S @import org.apache.lucene.analysis.standard.StandardAnalyzer;& o/ k" `, ~" k4 @% z+ D
import org.apache.lucene.document.Document;/ U! h/ O( N) q9 h
import org.apache.lucene.index.DirectoryReader;- y. Q& u% b7 k
import org.apache.lucene.index.IndexReader;0 u9 ? y" d: i; I
import org.apache.lucene.queryparser.classic.QueryParser;: _6 v! Y6 Z+ I
import org.apache.lucene.search.IndexSearcher;4 d6 S" g( Y o& F3 n
import org.apache.lucene.search.Query;
! `! U! e+ ] X* e0 Simport org.apache.lucene.search.ScoreDoc;2 j* q v0 I0 d& d6 W0 |" @
import org.apache.lucene.search.TopDocs;
0 D& u9 v; s" B2 T* Oimport org.apache.lucene.search.highlight.Fragmenter;* R! P* b* |0 \0 Y, u' |! G @
import org.apache.lucene.search.highlight.Highlighter;
& W. T$ h$ `, N( z' i- Limport org.apache.lucene.search.highlight.QueryScorer;& g& [9 |9 e, T g# ?
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;3 {* ~, ?+ _3 E, c6 \5 g7 Y; v
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;$ j: E _3 q# ` _ o
import org.apache.lucene.store.Directory;
0 G/ D+ `( F, @- D* Z, p1 fimport org.apache.lucene.store.FSDirectory;
5 {; o4 J# @( O i! N5 |( ?
, b7 m# c! p$ \; [8 ] Ppublic class Searcher {
* i2 d* \& c3 z
% e# h" _% y0 C* ` public static void search(String indexDir,String q)throws Exception{
' y/ `& T0 H* m) N( ~( @ Directory dir=FSDirectory.open(Paths.get(indexDir));
. t( q0 s5 [, t, w6 }5 P IndexReader reader=DirectoryReader.open(dir);
, V& P. N% k6 x2 `! ^+ l IndexSearcher is=new IndexSearcher(reader);
; @% ~9 T) p) I& P2 D/ V // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器8 ^8 I! c5 c X) `
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();! U+ D! q9 j' s/ \8 v0 R" L
QueryParser parser=new QueryParser("desc", analyzer);% U; Q( w( o0 [9 N
Query query=parser.parse(q);# t1 F" a2 K0 |3 g% i
long start=System.currentTimeMillis();
1 s' X* y6 }3 M: }4 b) K TopDocs hits=is.search(query, 10);, J9 N& O- W3 K8 y9 D9 W
long end=System.currentTimeMillis();
* H2 p5 y/ ]# A. k0 D System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");' r) b+ H4 O% A8 ~2 ?* K
q! Q7 n% i1 U( j6 f3 D: V
QueryScorer scorer=new QueryScorer(query);+ _2 r! ?0 R$ u# E# q0 d
Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);+ }5 H* _3 _ Y% O7 b: |; n6 ?
SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");
( Y5 W M! L% [4 E$ [4 z$ t4 U Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);0 A- X& V) [! O( L; c
highlighter.setTextFragmenter(fragmenter);
5 c- E- e- U2 _! X for(ScoreDoc scoreDoc:hits.scoreDocs){0 }! ?# j) R _' O+ A8 F
Document doc=is.doc(scoreDoc.doc);; i% x) u" F8 E( {+ e
System.out.println(doc.get("city"));
% ]+ R4 f" W+ w6 ^: s, R System.out.println(doc.get("desc"));
5 q) K9 \ ~( d3 e0 s" D3 d- M String desc=doc.get("desc");* v0 z U8 x) {- G+ U5 L
if(desc!=null){
4 [+ k: K* Z6 L* ^ TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));, K7 t( E7 K; E- M! C+ \0 _
System.out.println(highlighter.getBestFragment(tokenStream, desc));
1 Z# {* h3 H1 a }9 u) a6 p9 y7 c8 }5 f
}$ E* V: H& E2 n* E6 e' Q
reader.close();' X. u B! ]8 V
}
- G8 e, ~6 R- d E
# ?3 L) Z# O$ ^) [3 F, A" N, O public static void main(String[] args) {3 m. e7 F+ k' ]2 G/ `) k
String indexDir="D:\\lucene6";
# f# h$ N" a _1 Z: n( f& e String q="南京文明";# H) l6 W7 j: E, s! ]
try {
- E& h# y+ c p, F. ^ search(indexDir,q);
5 _5 X* @/ S5 V, y2 B& B' z' z } catch (Exception e) {. ?0 Z1 z) M+ A% J
// TODO Auto-generated catch block
* n a# e9 P3 J; q$ ~ e.printStackTrace();
- d6 H+ d, P8 I; C }2 h( I" ]) x7 @/ m7 t; B
}
$ R2 s" d: p* h}0 R$ K. A$ c! x: \( h$ T" S) L
3 m' I% Q$ k: b3 r8 K! T' M. A) _
5 ~0 ?! e; I& ~4 l ]" f- Y5 L9 ^/ q+ D
) ~# \3 M" Y3 q7 a6 s
, `. @7 x! r' h
|
|