All:
I am beginner in R topic modeling, it all started three weeks ago. So my problem is I can successfully processed my data into corpus, Document term matrix and LDA function. I have tweets as my input and about 460,000 tweets. But I am not happy with the result, the words across all topic are very similar.
packages <- c('tm','topicmodels','SnowballC','RWeka','rJava')if (length(setdiff(packages, rownames(installed.packages()))) > 0) {install.packages(setdiff(packages, rownames(installed.packages()))) }options( java.parameters = "-Xmx4g" )library(tm)library(topicmodels)library(SnowballC)library(RWeka)print("Please select the input file");flush.console();ifilename <- file.choose();raw_data=scan(ifilename,'string',sep="\n",skip=1);tweet_data=raw_data;rm(raw_data);tweet_data = gsub("(RT|via)((?:\\b\\W*@\\w+)+)","",tweet_data)tweet_data = gsub("http[^[:blank:]]+", "", tweet_data)tweet_data = gsub("@\\w+", "", tweet_data)tweet_data = gsub("[ \t]{2,}", "", tweet_data)tweet_data = gsub("^\\s+|\\s+$", "", tweet_data)tweet_data = gsub('\\d+', '', tweet_data)tweet_data = gsub("[[:punct:]]", "", tweet_data)max_size=5000;data_size=length(tweet_data);itinerary=ceiling(data_size[1]/max_size);if (itinerary==1){pre_data=tweet_data}else {pre_data=tweet_data[1:max_size]}corp <- Corpus(VectorSource(pre_data));corp<-tm_map(corp,tolower);corp<-tm_map(corp,removePunctuation);extend_stop_word=c('description:','null','text:','description','url','text','aca','obama','romney','ryan','mitt','conservative','liberal');corp<-tm_map(corp,removeNumbers);gc();IteratedLovinsStemmer(corp, control = NULL)gc();corp<-tm_map(corp,removeWords,c(stopwords('english'),extend_stop_word));gc();corp <- tm_map(corp, PlainTextDocument)gc();dtm.control = list(tolower= F,removePunctuation=F,removeNumbers= F, stemming= F, minWordLength = 3,weighting= weightTf,stopwords=F)dtm = DocumentTermMatrix(corp, control=dtm.control)gc();#dtm = removeSparseTerms(dtm,0.99)dtm = dtm[rowSums(as.matrix(dtm))>0,]gc();best.model <- lapply(seq(2,50, by=2), function(k){LDA(dtm[1:10,], k)})gc();best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))best.model.logLik.df <- data.frame(topics=c(seq(2,50, by=2)), LL=as.numeric(as.matrix(best.model.logLik)))k=best.model.logLik.df[which.max(best.model.logLik.df$LL),1];cat("Best topic number is k=",k);flush.console();gc();lda.model = LDA(dtm, k,method='VEM')gc();write.csv(terms(lda.model,50), file = "terms.csv");lda_topics=topics(lda.model,1);
The following is the results I get:
> terms(lda.model,10) Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 [1,] "taxes""medicare""tax""tax""jobs" [2,] "pay""will""returns""returns""plan" [3,] "welfare""tax""gop""taxes""gop" [4,] "will""care""taxes""will""military" [5,] "returns""can""abortion""paul""will" [6,] "plan""laden""can""medicare""tax" [7,] "economy""vote""tcot""class""paul" [8,] "budget""economy""muslim""budget""campaign" [9,] "president""taxes""campaign""says""says"[10,] "reid""just""economy""cuts""can" Topic 6 Topic 7 Topic 8 Topic 9 [1,] "medicare""tax""medicare""tax" [2,] "taxes""medicare""tax""president" [3,] "plan""taxes""jobs""jobs" [4,] "tcot""tcot""tcot""taxes" [5,] "budget""president""foreign""medicare" [6,] "returns""jobs""plan""tcot" [7,] "welfare""budget""will""paul" [8,] "can""energy""economy""health" [9,] "says""military""bush""people"[10,] "obamacare""want""now""gop" Topic 10 Topic 11 Topic 12 [1,] "tax""gop""gop" [2,] "medicare""tcot""plan" [3,] "tcot""military""tax" [4,] "president""jobs""taxes" [5,] "gop""energy""welfare" [6,] "plan""will""tcot" [7,] "jobs""ohio""military" [8,] "will""abortion""campaign" [9,] "cuts""paul""class"[10,] "paul""budget""just"
As you can see the words "tax""medicare" are across all topic. I noticed that while I playing with the dtm = removeSparseTerms(dtm,0.99)
the results may changes a little. And the following is my sample input data
> tweet_data[1:10] [1] " While Romney friends get richer MT Romney Ryan Economic Plans Would Increase Unemployment Deepen Recession" [2] "Wayne Allyn Root claims proof of Obama s foreign citizenship During a radio show interview Resist" [3] " President Obama Chief Investor Leave Energy Upgrades to the Businesses Reading President Obama誷 latest Execu " [4] " Brotherhood starts crucifixions Opponents of Egypt s Muslim president executed naked on trees Obama s tcot" [5] " Say you stand with President Obama裻he candidate in this election who trusts women to make their own health decisions " [6] " Romney Ryan Descend Into Medicare Gibberish " [7] "Maddow Romney demanded opponents tax returns and lied about residency in The Raw Story" [8] "Is it not grand How can Jews reconcile Obama Carter s treatment of Jews Israel How ca " [9] " The Tax Returns are Hurting Romney Badly "[10] " Replacing Gen Dempsey is cruicial to US security Dempsey disappointed by anti Obama campaign by ex military members h "
Please Help!!Thanks!