--- core/src/dualist/pipes/DocumentPipe.java.orig 2012-02-11 05:07:28.000000000 +0900 +++ core/src/dualist/pipes/DocumentPipe.java 2012-02-22 22:44:45.000000000 +0900 @@ -13,6 +13,8 @@ import cc.mallet.pipe.TokenSequenceRemoveStopwords; import cc.mallet.types.Instance; +import dualist.pipes.SimpleMecabPipe; + public class DocumentPipe extends Pipe { private Pipe myPipe = new SerialPipes(new Pipe[] { @@ -24,6 +26,9 @@ new CharSequenceReplace(Pattern.compile("&(.*?);"), ""), new CharSequenceReplace(Pattern.compile("[0-9]+"), "00"), new CharSequenceLowercase(), + (System.getProperty("dualist.lang") != null && + System.getProperty("dualist.lang").equals("ja")) ? + new SimpleMecabPipe() : // new CharSequence2TokenSequence(CharSequenceLexer.LEX_WORD_CLASSES), new CharSequence2TokenSequence("[\\p{L}\\p{Mn}]+"), new TokenSequenceRemoveStopwords(), --- build.xml.orig 2012-03-08 23:07:56.000000000 +0900 +++ build.xml 2012-03-09 09:32:14.000000000 +0900 @@ -26,7 +26,7 @@ - +