
Commit 94f4bf9: Initial commit
Committed Dec 20, 2015 (0 parents)

File tree

4 files changed: +54 lines, -0 lines


DSCapstone.Rproj (+13 lines)

Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

README.md (+4 lines)

# Data Science Capstone Project

This is my repo for work towards the capstone project. If you're already looking at GitHub,
this might actually be interesting for you. I'm using it mostly so that I can run large scripts
on a more powerful server.

analyser.R (+12 lines)

# Read a sample of the English news corpus and build a term-document matrix.
library("tm")
library("RWeka")

massive <- readLines("final/en_US/en_US.news.txt", n = 10000)
myCorpus <- Corpus(VectorSource(massive))

# Clean the corpus: lower-case, then strip numbers and punctuation.
# tolower returns a bare character vector, so wrap it in content_transformer();
# this keeps each document a PlainTextDocument without a separate conversion step.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, removePunctuation)
print("OK")

# Tokenizers: RWeka's NGramTokenizer for bigrams, WordTokenizer for unigrams.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
UnigramTokenizer <- function(x) WordTokenizer(x)

tdm <- TermDocumentMatrix(myCorpus, control = list(tokenize = UnigramTokenizer))
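
A quick way to sanity-check the unigram matrix is to pull out the highest-frequency terms. The lines below are a sketch of that check, not part of this commit; they assume the tdm object built above and use tm's findFreqTerms() together with slam::row_sums() (slam is a dependency of tm), so the sparse matrix never has to be densified.

# Sketch (not in the original script): inspect the most frequent unigrams.
library("slam")
freqs <- sort(slam::row_sums(tdm), decreasing = TRUE)  # total count per term
head(freqs, 20)                                        # top 20 unigrams
findFreqTerms(tdm, lowfreq = 50)                       # terms seen at least 50 times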

predictor.R (+25 lines)

# Plan: create a "Stupid Backoff" predictor.
# Do this by getting n-grams up to 3, then searching on the last two words entered and returning
# the most likely continuation; if there are none, search on the last word entered and return the
# next one; if there are still none, return the most likely word overall (which is lame).

# When Stupid Backoff is functional, try a contextually-aware selector: choose which Stupid
# Backoff model to use by analysing which corpus the user is most likely employing. Do so by
# finding a selection of 1- or 2-grams which are characteristic of either tweets, blogs, or news.
# Then return the appropriate word from that model (or re-weight the models and return a word
# from the meta-model).

# The create*grams functions should be run once on the big server and the resulting data should
# then be loaded from disk. I might make them caching so that they work either way.

create3grams <- function(corp) {
}

create2grams <- function(corp) {
}

create1grams <- function(corp) {
}
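
The backoff cascade described in the plan could be sketched roughly as below. This is an illustrative sketch only, not code from this commit: it assumes the create*grams functions eventually return plain data frames with hypothetical columns w1, w2, w3 and count, and it omits the fixed 0.4 back-off discount of full Stupid Backoff, which does not change which word wins when only the single best candidate per level is returned.

# Illustrative sketch of the backoff lookup described above (not part of this commit).
# Assumes hypothetical n-gram tables of the form:
#   grams3: data.frame(w1, w2, w3, count)   trigrams
#   grams2: data.frame(w1, w2, count)       bigrams
#   grams1: data.frame(w1, count)           unigrams

predictNext <- function(words, grams3, grams2, grams1) {
  n <- length(words)

  # 1. Try the last two words entered against the trigram table.
  if (n >= 2) {
    hits <- grams3[grams3$w1 == words[n - 1] & grams3$w2 == words[n], ]
    if (nrow(hits) > 0) return(hits$w3[which.max(hits$count)])
  }

  # 2. Back off to the last word against the bigram table.
  if (n >= 1) {
    hits <- grams2[grams2$w1 == words[n], ]
    if (nrow(hits) > 0) return(hits$w2[which.max(hits$count)])
  }

  # 3. Final fallback: the most frequent word overall (the "lame" case).
  grams1$w1[which.max(grams1$count)]
}

# Example call with hypothetical tables:
# predictNext(c("thanks", "for"), grams3, grams2, grams1)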
