java-plsa
This package provides a Java implementation of probabilistic latent semantic analysis (pLSA).
Install
Add the following dependency to your POM file:
<dependency>
<groupId>com.github.chen0040</groupId>
<artifactId>java-plsa</artifactId>
<version>1.0.1</version>
</dependency>
Usage
The sample code below illustrates how to perform topic modelling using pLSA:
// Input corpus: one string per document ("..." stands for further documents).
List<String> docs = Arrays.asList("[doc-1-content]", "[doc-2-content]", ...);

// Create the pLSA instance, set its options, then fit it on the corpus.
pLSA method = new pLSA();
method.setStemmerEnabled(true);
method.setMaxIters(10);
method.setMaxVocabularySize(1000);
method.fit(docs);

// Per topic: print the top-ranking documents and words together with their scores.
// NOTE(review): the second argument (3) is presumably the number of entries requested -- confirm.
for (int topicIndex = 0; topicIndex < method.getTopicCount(); topicIndex++) {
    List<TupleTwo<Document, Double>> bestDocs = method.getTopRankingDocs4Topic(topicIndex, 3);
    List<TupleTwo<String, Double>> bestWords = method.getTopRankingWords4Topic(topicIndex, 3);

    System.out.println("Topic " + topicIndex + ": ");

    System.out.println("Top Ranked Document:");
    for (TupleTwo<Document, Double> rankedDoc : bestDocs) {
        Document document = rankedDoc._1();
        double docScore = rankedDoc._2();
        // Prints "<docIndex>(<score>), " followed by the document content on the same line.
        String docLabel = document.docIndex() + "(" + docScore + "), ";
        System.out.print(docLabel);
        System.out.println(document.content());
    }
    System.out.println();

    System.out.println("Top Ranked Words:");
    for (TupleTwo<String, Double> rankedWord : bestWords) {
        String term = rankedWord._1();
        double termScore = rankedWord._2();
        String termLabel = term + "(" + termScore + "), ";
        System.out.print(termLabel);
    }
    System.out.println();
}

System.out.println("// ============================================= //");

// Per document: print the top-ranking topics together with their scores, all on one line.
for (int docIndex = 0; docIndex < method.getDocCount(); docIndex++) {
    List<TupleTwo<Integer, Double>> bestTopics = method.getTopRankingTopics4Doc(docIndex, 3);

    System.out.print("Doc " + docIndex + ": ");
    for (TupleTwo<Integer, Double> rankedTopic : bestTopics) {
        int topicId = rankedTopic._1();
        double topicScore = rankedTopic._2();
        String topicLabel = topicId + "(" + topicScore + "), ";
        System.out.print(topicLabel);
    }
    System.out.println();
}