# Strip punctuation and digits from the description text.
# Matches are deleted (not replaced by a space), so hyphenated words collapse
# into a single token, e.g. "eight-year-old" becomes "eightyearold".
# The POSIX class must be spelled "[[:punct:]]"; the previous "[[:punct::]]"
# (extra colon) was not the punctuation class.
description_vector <- gsub("[[:punct:]]", "", description_vector)
description_vector <- gsub("[[:digit:]]", "", description_vector)
# Explicit second pass for guillemets and common marks: what [[:punct:]]
# covers beyond ASCII depends on the locale, so do not rely on it for « and ».
description_vector <- gsub("[!.»«?,-]", "", description_vector)
# Build a tm corpus with one document per element of description_vector.
description_corpus <- Corpus(VectorSource(description_vector))

# Normalise the text of every document in the corpus.
# Lowercase first so the (lowercase) stopword lists below match.
description_corpus <- tm_map(description_corpus, content_transformer(tolower))

# Remove standard English stopwords.
description_corpus <- tm_map(
  description_corpus,
  removeWords,
  stopwords("english")
)

# Remove a project-specific list of filler words and character names.
# removeWords() combines the entries into a single regular expression, so
# every entry must be a plain word: the former "..." entry acted as a
# three-character wildcard and deleted all three-letter tokens. It has been
# dropped (punctuation is already stripped from the text beforehand), and
# duplicate entries were removed.
# NOTE(review): the guillemet entries are punctuation-only and removeWords()
# anchors on word boundaries, so they probably rarely match - confirm.
description_corpus <- tm_map(
  description_corpus,
  removeWords,
  c(
    "the", "an", "a", "must", "and", "could", "would", "have", "set",
    "exists", "turn", "tale", "might", "else", "causing", "whose", "whos",
    "sets", "put", "always", "gets", "before", "very", "loosely", "with",
    "various", "into", "bearing", "behind", "predictably", "follows",
    "eight", "frantic", "tales", "lives", "dealing", "eightyearold",
    "inadvertently", "finds", "thing", "take", "taken", "tries", "several",
    "kevin", "mccallister", "become", "commanding", "like", "three",
    "former", "general", "cynical", "griswold", "walter", "desparately",
    "frustrated", "never", "sent", "unfortunately", "becoming", "left",
    "becomes", "really", "begins", "marie", "miranda", "four", "seemingly",
    "meaning", "emily", "something", "also", "actually", "early",
    "despises", "thinks", "wagner", "jill", "mark", "appear", "reluctantly",
    "decided", "fending", "indulge", "wants", "matt", "meet", "comes",
    "going", "best", "think", "leaving", "thought", "approaching", "still",
    "heading", "picked", "decides", "seven", "shows", "learns", "lucas",
    "agrees", "happens", "robin", "mary", "jacob", "laurel", "facing",
    "quickly", "shocking", "peter", "mariah", "everett", "sebastian",
    "roped", "austin", "jokingly", "timothy", "emmanuel", "attends",
    "carlton", "mcandrick", "sarah", "palmer", "christmas", "christmastime",
    "summary", "»", " »", "«", " «", "tasked", "helps", "things", "make",
    "keep", "falls", "makes", "another", "will", "come", "spend", "meets",
    "goes", "bring", "full", "takes", "just", "decide", "unexpectedly",
    "help", "show", "years", "find", "back", "york", "plot", "claus",
    "holiday", "young", "season", "time", "year", "small", "little", "girl",
    "woman", "man", "true", "plan", "plans", "lizzie", "virginia",
    "richfield", "pull", "begin", "foxworth", "maggie", "harper", "whole",
    "current", "starts", "name", "zeus", "amalie", "hess", "brings",
    "receives", "need", "pole", "place", "others", "plaza", "existed",
    "bumbling", "vermont", "adam", "calvin", "langton", "anymore",
    "expect", "massey", "kyla", "cordinia", "isadora", "leopold", "jonna",
    "nick", "julie", "walshrick", "mgtow", "briana", "laura", "kylie",
    "chloe", "evan", "third", "noelle", "lets", "serving", "even", "anna",
    "galwick", "chronicles", "story", "reel"
  )
)

# Collapse the runs of whitespace left behind by the removals above.
description_corpus <- tm_map(description_corpus, stripWhitespace)