TOKEN_MGR_DECLS : { /** use MYTokenizer to process cjk character */ //新增此行注释 private MYTokenizer myTokenizer = null; //新增此行 /** a global cjk token */ //新增此行注释 private org.apache.lucene.analysis.Token cjkToken = null; //新增此行,输出流
/** start offset of cjk sequence */ //新增此行注释 private int cjkStartOffset = 0; //新增此行
/** Constructs a token manager for the provided Reader. */ public NutchAnalysisTokenManager(Reader reader) {
文件第106行附近: }
// chinese, japanese and korean characters | <SIGRAM: <CJK> > //删除此行 | <SIGRAM: (<CJK>)+ > //新增此行(#行)
//以下所有行均为新增,紧接上行代码(#行)书写 { /** * use an instance of myTokenizer: myTokenizer holds the maximum * matched cjk chars, and cjkToken the current token; * reset matchedToken.image using cjkToken.termText(); * reset matchedToken.beginColumn using cjkToken.startOffset(); * reset matchedToken.endColumn using cjkToken.endOffset(); * backup the last char when the next cjkToken is valid. */ if(myTokenizer == null) { myTokenizer = new MYTokenizer(new StringReader(image.toString())); cjkStartOffset = matchedToken.beginColumn; try { cjkToken = myTokenizer.next(); } catch(IOException ioe) { cjkToken = null; } }