# NYTimes blog popularity prediction (R script)
# The working directory must contain the Kaggle NYTimes blog CSVs.
# NOTE(review): setwd() in a script is machine-specific -- adjust as needed.
setwd("~/Downloads")

# Read train/test sets. Text columns stay character
# (stringsAsFactors = FALSE) because they are processed with tm below.
NewsTrain <- read.csv("NYTimesBlogTrain.csv", stringsAsFactors = FALSE)
NewsTest  <- read.csv("NYTimesBlogTest.csv",  stringsAsFactors = FALSE)

# Inspect the raw structure if needed:
#str(NewsTrain)
#str(NewsTest)
# Convert the categorical metadata columns to factors. Test-set factors
# are built with the *training* levels so the model matrices line up;
# any level seen only in the test set becomes NA.
for (meta_col in c("NewsDesk", "SectionName", "SubsectionName")) {
  NewsTrain[[meta_col]] <- as.factor(NewsTrain[[meta_col]])
  NewsTest[[meta_col]] <- factor(NewsTest[[meta_col]],
                                 levels = levels(NewsTrain[[meta_col]]))
}
# Parse the publication timestamp as POSIXlt so the broken-down fields
# ($wday, $mon, $hour, $min) are directly accessible.
NewsTrain$PubDate <- strptime(NewsTrain$PubDate, "%Y-%m-%d %H:%M:%S")
NewsTest$PubDate  <- strptime(NewsTest$PubDate,  "%Y-%m-%d %H:%M:%S")

# Derive weekday / month / hour / minute as factor features.
# NOTE(review): train and test are factored independently here, so their
# level sets could differ if some hour/weekday is absent from one set --
# verify before relying on predict() level alignment.
NewsTrain$Weekday <- as.factor(NewsTrain$PubDate$wday)
NewsTest$Weekday  <- as.factor(NewsTest$PubDate$wday)
NewsTrain$Month   <- as.factor(NewsTrain$PubDate$mon)
NewsTest$Month    <- as.factor(NewsTest$PubDate$mon)
NewsTrain$Hour    <- as.factor(NewsTrain$PubDate$hour)
NewsTest$Hour     <- as.factor(NewsTest$PubDate$hour)
NewsTrain$Minute  <- as.factor(NewsTrain$PubDate$min)
NewsTest$Minute   <- as.factor(NewsTest$PubDate$min)
# see the summary after the modifications
#summary(NewsTrain)
#summary(NewsTest)
# ---- Text preprocessing -------------------------------------------------
library(tm)

#' Build a cleaned, stemmed corpus from train+test text concatenated.
#'
#' Shared pipeline for Headline / Snippet / Abstract. `extra_stopwords`
#' removes field-specific high-frequency words on top of English stopwords.
#' NOTE: with older tm versions, tolower() must be followed by
#' PlainTextDocument to restore the document class before the remaining
#' transformations -- keep the order as-is.
build_corpus <- function(text, extra_stopwords) {
  corpus <- Corpus(VectorSource(text))
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, PlainTextDocument)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("english"), extra_stopwords))
  tm_map(corpus, stemDocument)
}

# Corpora are train rows followed by test rows; split again downstream.
CorpusHeadline <- build_corpus(
  c(NewsTrain$Headline, NewsTest$Headline),
  c("new", "york", "day", "today", "week", "report")
)
CorpusSnippet <- build_corpus(
  c(NewsTrain$Snippet, NewsTest$Snippet),
  c("new", "york", "will", "week", "year")
)
CorpusAbstract <- build_corpus(
  c(NewsTrain$Abstract, NewsTest$Abstract),
  c("new", "york", "will", "can", "week", "year")
)
library(wordcloud)

#' Render a word cloud of term frequencies in `corpus` to a PNG file.
#'
#' @param corpus A tm corpus.
#' @param filename Output PNG path.
make_wordcloud <- function(corpus, filename) {
  tdm <- TermDocumentMatrix(corpus)
  # Total frequency of each term across all documents, most frequent first.
  freqs <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  pal <- brewer.pal(8, "Dark2")
  png(filename, width = 1280, height = 800)
  # Close the device even if wordcloud() errors (no leaked graphics device).
  on.exit(dev.off(), add = TRUE)
  wordcloud(names(freqs), freqs, scale = c(8, .3), min.freq = 2,
            max.words = 100, random.order = TRUE, rot.per = .15,
            colors = pal, vfont = c("sans serif", "plain"))
}

make_wordcloud(CorpusHeadline, "headlines.png")
make_wordcloud(CorpusSnippet, "snippet.png")
make_wordcloud(CorpusAbstract, "abstract.png")
# ---- Model 1: Headline terms + article metadata -------------------------
dtm <- DocumentTermMatrix(CorpusHeadline)
# Keep terms appearing in at least 1% of documents.
sparse <- removeSparseTerms(dtm, 0.99)
Words <- as.data.frame(as.matrix(sparse))
# Ensure syntactically valid, unique column names for modelling
# (the original `colnames(Words) = colnames(Words)` was a no-op).
colnames(Words) <- make.names(colnames(Words), unique = TRUE)

# Split back into train/test rows (corpus was train followed by test).
WordsTrain <- head(Words, nrow(NewsTrain))
WordsTest <- tail(Words, nrow(NewsTest))

# Attach the target and the metadata features to both sets.
WordsTrain$Popular <- NewsTrain$Popular
for (feat in c("WordCount", "NewsDesk", "SectionName", "SubsectionName",
               "Weekday", "Hour")) {
  WordsTrain[[feat]] <- NewsTrain[[feat]]
  WordsTest[[feat]] <- NewsTest[[feat]]
}

library(randomForest)
# Regression forest on the 0/1 target; predictions serve as
# probability-like scores.
WordsRF <- randomForest(
  WordsTrain[, -which(names(WordsTrain) == "Popular")],
  WordsTrain$Popular,
  mtry = 10, ntree = 2000, importance = TRUE,
  keep.forest = TRUE, do.trace = 100
)

# Out-of-bag training predictions: confusion table at a 0.5 cutoff,
# overall accuracy, and AUC.
PredTrain <- predict(WordsRF)
tab <- table(WordsTrain$Popular, PredTrain > 0.5)
tab
sum(diag(tab)) / sum(tab)
library(ROCR)
pred <- prediction(PredTrain, WordsTrain$Popular)
as.numeric(performance(pred, "auc")@y.values)

# Test-set scores, floored at 0 (a regression forest can dip below 0).
PredTest <- predict(WordsRF, newdata = WordsTest)
PredTest_H <- pmax(PredTest, 0)
#################
# ---- Model 2: Snippet terms + article metadata --------------------------
dtm <- DocumentTermMatrix(CorpusSnippet)
# Keep terms appearing in at least 2.5% of documents.
sparse <- removeSparseTerms(dtm, 0.975)
Words <- as.data.frame(as.matrix(sparse))
# Ensure syntactically valid, unique column names for modelling
# (the original `colnames(Words) = colnames(Words)` was a no-op).
colnames(Words) <- make.names(colnames(Words), unique = TRUE)

# Split back into train/test rows (corpus was train followed by test).
WordsTrain <- head(Words, nrow(NewsTrain))
WordsTest <- tail(Words, nrow(NewsTest))

# Attach the target and the metadata features to both sets.
WordsTrain$Popular <- NewsTrain$Popular
for (feat in c("WordCount", "NewsDesk", "SectionName", "SubsectionName",
               "Weekday", "Hour")) {
  WordsTrain[[feat]] <- NewsTrain[[feat]]
  WordsTest[[feat]] <- NewsTest[[feat]]
}

# Regression forest on the 0/1 target (same settings as the Headline model).
WordsRF <- randomForest(
  WordsTrain[, -which(names(WordsTrain) == "Popular")],
  WordsTrain$Popular,
  mtry = 10, ntree = 2000, importance = TRUE,
  keep.forest = TRUE, do.trace = 100
)

# Out-of-bag training predictions: confusion table, accuracy, AUC.
PredTrain <- predict(WordsRF)
tab <- table(WordsTrain$Popular, PredTrain > 0.5)
tab
sum(diag(tab)) / sum(tab)
library(ROCR)
pred <- prediction(PredTrain, WordsTrain$Popular)
as.numeric(performance(pred, "auc")@y.values)

# Test-set scores, floored at 0.
PredTest <- predict(WordsRF, newdata = WordsTest)
PredTest_S <- pmax(PredTest, 0)
#############################
# ---- Model 3: Abstract terms + article metadata -------------------------
dtm <- DocumentTermMatrix(CorpusAbstract)
# Keep terms appearing in at least 2.5% of documents.
sparse <- removeSparseTerms(dtm, 0.975)
Words <- as.data.frame(as.matrix(sparse))
# Prefix term columns so they cannot clash with the metadata columns.
# (The original `paste("S_", ...)` used the default sep = " ", producing
# column names with embedded spaces, and a misleading "S_" prefix.)
colnames(Words) <- paste0("A_", colnames(Words))

# Split back into train/test rows (corpus was train followed by test).
WordsTrain <- head(Words, nrow(NewsTrain))
WordsTest <- tail(Words, nrow(NewsTest))

# Attach the target and the metadata features to both sets.
WordsTrain$Popular <- NewsTrain$Popular
for (feat in c("WordCount", "NewsDesk", "SectionName", "SubsectionName",
               "Weekday", "Hour")) {
  WordsTrain[[feat]] <- NewsTrain[[feat]]
  WordsTest[[feat]] <- NewsTest[[feat]]
}

# Regression forest on the 0/1 target (same settings as the other models).
WordsRF <- randomForest(
  WordsTrain[, -which(names(WordsTrain) == "Popular")],
  WordsTrain$Popular,
  mtry = 10, ntree = 2000, importance = TRUE,
  keep.forest = TRUE, do.trace = 100
)

# Out-of-bag training predictions: confusion table, accuracy, AUC.
PredTrain <- predict(WordsRF)
tab <- table(WordsTrain$Popular, PredTrain > 0.5)
tab
sum(diag(tab)) / sum(tab)
library(ROCR)
pred <- prediction(PredTrain, WordsTrain$Popular)
as.numeric(performance(pred, "auc")@y.values)

# Test-set scores, floored at 0.
PredTest <- predict(WordsRF, newdata = WordsTest)
PredTest_A <- pmax(PredTest, 0)
################
# Ensemble: average the three per-model scores and clamp to [0, 1] so the
# submission column is a valid probability (each score is already floored
# at 0, but a regression forest can produce values above 1).
PredTestFinal <- pmin(rowMeans(data.frame(PredTest_H, PredTest_S, PredTest_A)), 1)
MySubmissionFinal <- data.frame(UniqueID = NewsTest$UniqueID,
                                Probability1 = PredTestFinal)
write.csv(MySubmissionFinal, "SubmissionRF_Final3.csv", row.names = FALSE)
# Word clouds were written to headlines.png, snippet.png, and abstract.png above.