#################################################
# Loren Collingwood, University of Washington	#
# UNC -- RTextTools workshop					#
# Lecture 3										#
# Labeling U.S. Congress data					#
#################################################

#install.packages("RTextTools") # Requires R 2.14.1 or greater
library(RTextTools)
#Set working directory
#setwd("/Users/lorencollingwood/Documents/consulting/rtexttools/unc_workshop")

# Show what is in the current working directory
list.files()

# gmodels is loaded for CrossTable(), used on the topic codes further down
# install.packages("gmodels")
library(gmodels)

# Read the U.S. Congress CSV that is bundled inside the RTextTools package
congress <- read_data(
  system.file("data/USCongress.csv.gz", package = "RTextTools"),
  type = "csv"
)
head(congress)  # peek at the first rows
dim(congress)   # number of rows and columns

# Tabulate the distribution of the major topic codes
CrossTable(congress$major)

# Randomize Data [OPTIONAL]
# Fix the seed so the shuffle is reproducible, then reorder the rows with a
# random permutation. sample.int(n) draws the same permutation as
# sample(1:n, size = n, replace = FALSE) but, unlike 1:n, does not turn into
# c(1, 0) when the data frame happens to have zero rows.
set.seed(999)
congress <- congress[sample.int(nrow(congress)), ]

###########################################
# 		Document Term Matrix Creation	  #
###########################################

# Build the document-term matrix from the "text" and "cong" columns,
# requesting English language handling, number removal, stemming and
# tf-idf weighting.
congress_matrix <- create_matrix(
  cbind(congress[["text"]], congress[["cong"]]),
  language = "english",
  removeNumbers = TRUE,
  stemWords = TRUE,
  weighting = weightTfIdf
)
congress_matrix  # print the resulting matrix object

###########################################
#		Corpus and Container Creation     #
###########################################
# Pair the matrix with the hand-coded "major" labels.
# Rows 1-4000 form the training set, rows 4001-4449 the test set.
# NOTE(review): later RTextTools releases appear to expose this function as
# create_container() -- confirm against the installed package version.
corpus <- create_corpus(
  congress_matrix,
  congress$major,
  trainSize = 1:4000,
  testSize = 4001:4449,
  virgin = FALSE
)
names(attributes(corpus))  # list what the returned object carries

###########################################
#		 	   Train Models     		  #
###########################################

# Fit one learner per requested algorithm (SVM and maximum entropy).
# This step may take some time.
models <- train_models(corpus, algorithms = c("SVM", "MAXENT"))

###########################################
#             Classify Data               #
###########################################

# Run the trained learners over the container to obtain predictions
results <- classify_models(corpus, models)

######################################
# 				Analytics			 #
######################################

# Combine the container and the classification results into an analytics object
analytics <- create_analytics(corpus, results)

# Summary of Algorithm Accuracy (one row per label; includes SVM_RECALL)
analytics@algorithm_summary

# Plot SVM Recall to see which labels are poor.
# x holds the topic codes (taken from the row names), y the SVM recall.
# The 20th row is dropped from both vectors.
# NOTE(review): presumably row 20 is a catch-all topic code that would
# stretch the x axis -- confirm against the label set.
x <- as.numeric(rownames(analytics@algorithm_summary))[-20]
y <- analytics@algorithm_summary$SVM_RECALL[-20]

plot(x, y, type = "l", lwd = 3, main = "Support Vector Machine Topic Accuracy",
     ylab = "Recall Accuracy", xlab = "Topic")
# Reference line at 75% recall
abline(h = 0.75, lwd = 2, col = "maroon")
# Annotate each point with its topic code. Without an explicit `labels`
# argument text() writes 1, 2, 3, ... (the position in x), which stops
# matching the topic code as soon as the codes are not consecutive.
text(x, y, labels = x, adj = 1.2)

# Per-label summary, followed by the first rows of the per-document results
analytics@label_summary
head(analytics@document_summary)

# Confusion matrices -- cross-tabulate the manual code against each set of
# predictions to look for possible problems
with(analytics@document_summary, table(true = MANUAL_CODE, predict = CONSENSUS_CODE))
with(analytics@document_summary, table(true = MANUAL_CODE, predict = PROBABILITY_CODE))
with(analytics@document_summary, table(true = MANUAL_CODE, predict = SVM_LABEL))
with(analytics@document_summary, table(true = MANUAL_CODE, predict = MAXENTROPY_LABEL))

# Overall accuracy of each prediction column, measured against the manual code
with(analytics@document_summary, recall_accuracy(MANUAL_CODE, CONSENSUS_CODE))
with(analytics@document_summary, recall_accuracy(MANUAL_CODE, PROBABILITY_CODE))
with(analytics@document_summary, recall_accuracy(MANUAL_CODE, SVM_LABEL))
with(analytics@document_summary, recall_accuracy(MANUAL_CODE, MAXENTROPY_LABEL))

###########################
# 	 Ensemble Agreement   #
###########################

# Set the standard of accuracy. If everything needs to be above 80%, then you
# will need to manually code the documents that do not reach that level.

analytics@ensemble_summary

final_data <- analytics@document_summary

# Tag every document with the recall associated with its agreement level.
# The column is created at full length first: assigning through a logical
# index into a column that does not exist yet builds a vector only as long as
# the last matching row, which then errors (or is silently recycled) whenever
# the trailing rows do not match. which() also keeps NA agreement values from
# being treated as matches.
# NOTE(review): the two percentages are hard-coded; presumably they were read
# off analytics@ensemble_summary for one particular run -- re-check them
# against the table printed above.
final_data$recall_ensemble <- NA_character_
final_data$recall_ensemble[which(final_data$CONSENSUS_AGREE == 2)] <- "88% recall"
final_data$recall_ensemble[which(final_data$CONSENSUS_AGREE == 1)] <- "72% recall"
head(final_data)

# Write out data for analysis in your favorite statistics program (aka Excel).
# The file is written to the current working directory, printed by getwd().
getwd()
write.csv(final_data, "congress_coded.csv", row.names = FALSE)

# Clean up only the objects this script created. rm(list = ls()) would also
# delete anything else the user already had in the workspace.
rm(congress, congress_matrix, corpus, models, results, analytics, x, y,
   final_data)