# Lyrics text mining, part 1: load the "1970s" sheet of km.xlsx and apply the
# first batch of text-normalisation rules to the `lyrics` column.
#
# NOTE(review): the non-ASCII string literals below are reproduced verbatim
# from the source, which appears to be stored in a legacy Korean encoding
# (presumably CP949/EUC-KR -- confirm before re-saving the file as UTF-8).

library(bitops)
library(RCurl)
library(KoNLP)
library(rJava)
library(tm)
library(wordcloud)
library(XLConnect)

# NOTE(review): the hard-coded setwd() makes the script machine-specific and
# rm(list = ls()) wipes the caller's workspace. Both are kept so the script
# behaves as before; the relative file names used later resolve against this
# directory.
setwd("D:/Users/Hyo/Cs-Ds/CS/MusicStudy/data")
rm(list = ls())

music <- file.path("km.xlsx")
music <- readWorksheetFromFile(music, sheet = "1970s")

# Kept for backward compatibility; the cleaning below no longer goes through
# this corpus object.
lyrics <- Corpus(VectorSource(music$lyrics))

# gsub() operates on character vectors. Feeding it the Corpus object relied on
# an implicit as.character() of the corpus internals, so start from the lyrics
# column itself.
result.text <- as.character(music$lyrics)

# Ordered (pattern, replacement) pairs, applied top to bottom with gsub().
# Order matters: when one pattern is a prefix of another, the longer pattern
# must come first, otherwise the shorter rule rewrites the text and the longer
# rule can never match.
lyric_rules <- list(
  c("\n", " "),
  c("\r", " "),
  c("º°µé", "º°"),
  c("±âµÕÀÌ", "±âµÕ"),
  c("¸¶³ª´Ô", "¸¶´©¶ó"),
  c("µþµé¾Æ", "µþ"),
  c("µû´Ô", "µþ"),
  c("´ÔÀÌ¿©", "´Ô"),
  c("´©±º°¡", "´©±¸"),
  c("´©±ºÁö", "´©±¸"),
  c("´©±¼", "´©±¸"),
  c("³»°Ô·Î", "³»"),
  c("³»°Ô¼­", "³»"),
  c("³ª´Â¾ß", "³ª"),
  c("¸Ô±¸¸§¾Æ", "¸Ô±¸¸§"),
  c("¸»¾¸", "¸»"),
  c("¸»ÇØ", "¸»"),
  c("¹«Áö°¹", "¹«Áö°³"),
  c("Ȧ·ÎÀÌ", "È¥ÀÚ"),
  c("¹°°á", "¹°"),
  c("¹Ù¶÷°á", "¹Ù¶÷"),
  c("¹Ù¶÷ºÒ¾î", "¹Ù¶÷"),
  # Longer form first (was shadowed by the shorter rule that follows).
  c("»ç¶÷µé¾Æ", "»ç¶÷"),
  c("»ç¶÷µé", "»ç¶÷"),
  # Longer form first (was shadowed by the shorter rule that follows).
  c("»ç¶ûÀ̾ú±â¿¡", "»ç¶û"),
  c("»ç¶ûÀ̾ú±â", "»ç¶û"),
  c("»çÀ§°¨", "»çÀ§"),
  c("¾îµð·ÐÁö", "¾îµð"),
  c("¾îµò°¡", "¾îµð"),
  c("¾îµð¼±°¡", "¾îµð"),
  c("¾î¸Ó´Ô", "¾î¸Ó´Ï"),
  c("¿©¿ïÁö´Â", "¿©¿ï"),
  c("¿ì¸®²¨¶ó¸é", "¿ì¸®")
)

for (rule in lyric_rules) {
  result.text <- gsub(rule[[1]], rule[[2]], result.text)
}
# Lyrics text mining, part 2: finish cleaning `result.text`, extract nouns,
# build term-document matrices, dump them to tab-separated files and draw one
# word cloud per matrix.
#
# NOTE(review): the non-ASCII string literals below are reproduced verbatim
# from the source, which appears to be stored in a legacy Korean encoding
# (presumably CP949/EUC-KR -- confirm before re-saving the file as UTF-8).

# Ordered (pattern, replacement) pairs, applied top to bottom with gsub().
cleanup_rules <- list(
  c("¿ì¸®µé", "¿ì¸®"),
  c("È£¼ý°¡", "È£¼ö"),
  c("ÇÞºµ", "ÇÞ»ì"),
  c("ÇÞºû", "ÇÞ»ì"),
  c("ÇÞ»ì", "ÇÞ»ì "),
  c("ÃÖÁø»ç³×", "ÃÖÁø»ç"),
  # Special symbols are replaced by a space.
  c("¢Ð", " "),
  c("¢¿", " "),
  c("¢Í", " "),
  c("¢Ý", " "),
  c("¨ç", " "),
  c("¨è", " "),
  c("¨é", " "),
  c("¢Á", " "),
  c("¢Ñ", " "),
  # Punctuation handling.
  c(",", ", "),
  c("\\.", "\\. "),
  c("\\+", " "),
  c("\\-", " "),
  c("\\:", " "),
  c("\\(", " "),
  c("\\)", " "),
  c(" \n", "\n"),
  c("=", " "),
  c("~", " "),
  # The carets must be escaped: an unescaped "^_^" is read as two anchors
  # around "_" and can never match the emoticon.
  c("\\^_\\^", " "),
  # Trim one leading and one trailing space.
  c("^ ", ""),
  c(" $", "")
)

for (rule in cleanup_rules) {
  result.text <- gsub(rule[[1]], rule[[2]], result.text)
}

# Remove "@" followed by any run of printable, non-space characters.
removeTwitSign <- function(x) {
  gsub("@[[:graph:]]*", "", x)
}

# Remove "http://" followed by any run of printable, non-space characters.
removeURL <- function(x) {
  gsub("http://[[:graph:]]*", "", x)
}

# Remove newline characters.
removeEnter <- function(x) {
  gsub("\n", "", x)
}

# Extract nouns from `x` with KoNLP::extractNoun() and join them into a single
# space-separated string.
exNouns <- function(x) {
  paste(extractNoun(x), collapse = " ")
}

# Convert NA to "".
result.text[is.na(result.text)] <- ""
result.text <- gsub("[[:punct:]]", "", result.text)
result.text <- gsub(" $", "", result.text)

useSejongDic()

# Register extra dictionary entries, all tagged "ncn".
mergeUserDic(data.frame(
  c(
    "°³¿©¿ï", "°¡½Ã³»", "²É°¡¸¶", "À®Å©", "²Ã", "¸¶·Î´Ï¿¡", "¸Ô±¸¸§",
    "¼­¿ï", "±×³à", "±×´ë", "ÀÌ»ÛÀÌ", "ÀÌ°÷", "Áý½Ã", "°¡°íÆÄ", "±âµÕ",
    "²Þ±æ", "µÑ·¯¸Þ´Ù", "ºÒ´Ù", "¼­±ÍÆ÷", "¼­´ÃÇÏ´Ù", "¼­ÇØ",
    "½Î´ÃÇÏ´Ù", "¾ÆÀÌ", "Åä¶óÁö´Ù"
  ),
  c("ncn")
))

# vapply() guarantees a character vector even when result.text is empty;
# sapply() would return an empty list in that case.
result_nouns <- vapply(result.text, exNouns, character(1))

myCorpus <- Corpus(VectorSource(result_nouns))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, tolower)

# English stop words plus a hand-maintained list of tokens to drop.
myStopwords <- c(
  stopwords('english'), "rt",
  "Åä¶óÁ®", "Åä¶óÁ³À»±î", "Åä¶óÁö¸é", "°¡·Á¸¶", "°ÅÁö", "±âµÕÁö¿ä",
  "°£´Ù³×", "°£´Ù", "°¡¿¡", "°¡³ë¶ó", "°¡³ë¶ó¸é", "°¡³ë", "°¡°íÆÄ",
  "±«°í", "±¦½º·¹", "²¬²¬²¬", "³ªÁÈ÷", "³¯À¸³×", "³Ñ°ÜÁÖ¼Ò", "³ÒÁ×",
  "³²¾Æ", "³»°í", "³»¸®¼Ò¼­", "´Ò¸®¸®¾ß", "´Ò¸®¸®", "¶Ñ·ç", "µí",
  "µÇ¾ú´Ù³×", "µÇ", "µéÀÌ", "¶ö¶ö¶ó¶ö¶ó", "¶ö¶ö¶ó", "¶ö¶ö",
  "¶ö¶ó¶ó¶ö¶ó", "¶ö¶ó", "¶ö¶ö¶ö¶ó", "¶ö¶ö¶ö¶ö¶ó", "·Î¿î", "·ç·ç·ç",
  "·ç·ç", "¶ó", "¶ó¶ó¶ó¶ó", "¶ó¶ó¶ó¶ó¶ó", "¶ó¶ó¶ó¶ó¶ö¶ö¶ó",
  "¶ó¶ó¶ö¶ö¶ó", "¶ó¶ö¶ó", "¶ó¶ö¶ó¶ó", "¶ó¶ö¶ö¶ó", "¶ó¿À",
  "¶ö¶ö¶ó¶ö¶ö¶ó", "¶ö¶ö¶ö", "·ç·ç·ç·ç", "¸£", "¸øÇÒ°Ô", "¸ð¸¥´Ù³×",
  "¸¸³µ¼Ò", "¸¸³ª", "¸¶Á®", "¸¶ÁÖ", "¸ð¸¥´Ù³×", "¸¶´Ù", "¸¸Å­",
  "ÀØÀ¸¶ó½Ã¸é", "¿ö¿ì¿ö¿ì¿ö", "ÀÌ·¡", "ÀÌ·²±î", "¿Ö³Ä°í", "¿ÀÁú¾Ê³×",
  "¿¹¿¹¿¹¿¹", "¿¹¿¹¿¹", "¿©¸´", "¿¡Çì¾ßµ¥Çì¾ß", "¿¡Çì¾ß", "¾î¼´Ù",
  "¾î¼Ý¾ß", "¾î¶°³Ä°í", "¾Æ¾Æ¾Æ", "¾Æ·Õ´ë´Â", "¿¹¿¹¿¹",
  "¾î¼­¿Í´ëÇì¾ß", "¾ó¾¾±¸³ª", "¾ø´Ù", "¾ø´Ù³×", "¾øÀ»°É", "¿øÄ¡",
  "½Í¼Ò", "½Î´Ã", "½º¸®", "½º¹«", "½ºÅç¶ó", "¼Ò±Ù´ë´Â", "»ì°Ú¼Ò",
  "»ì°íÆÄ¿ä", "»ìÀ¸·Æ´Ï´Ù", "»¯´Â°í", "»¯´Â", "ºÃ´Ù³ª¿ä", "ºË°í",
  "ºÎ¸£¸®±î", "º¸ÀÌÀÝ´Ï", "º¸ÀÌ", "º¸¾Ò¼Ò", "º¸°í", "º¸°íÆÄÁú",
  "º¯Ä¡¸»ÀÚ", "ºÎ¼¼¿ä", "ºÎ¼¼", "¹Ý±â", "¹ö¸±·¡", "Ç°À¸·Æ´Ï´Ù",
  "Ç°ÆÄ", "Ç°Ç°Ç°", "Àý¾¾±¸³ª", "ÁÖ·Á¹«³ª", "ÁÁÀ»½Ã°í", "ª´Ù¶õ",
  "ª±â¸¸ÇÑ", "Å͸¥", "Åä·Ï", "Çʸ®¸®ÀÌ", "ó·³", "ÇÏ°Ô", "ÇÏ°íÆÄ",
  "ÇÏ°íÆĵµ", "ÇÏ°íÇÂ", "Çß´Ù³×", "Çß¾ú¿À", "Çï·Î¾Æ", "È츶È츶È츶",
  "È츶È츶È츶È츶¿¹", "Çغ¼°É", "Çؼ­", "Çصµ", "ÇÔ", "ÇÒ°É",
  "ÇÛ¾³ÇÑ", "ÇϷžÆ", "Çϸ®", "Çϸ®±î", "ÇϼÒ", "ÇÏ¿À", "ÇÑ´Ü",
  "Çѵ¥", "¶ó¶ó¶ó"
)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

# Show at most the first five documents; a fixed 1:5 index would run past the
# end of a corpus with fewer than five documents.
inspect(myCorpus[seq_len(min(5L, length(myCorpus)))])

# Two matrices: one keeps terms of length >= 2, the other keeps all terms.
myTdm2 <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(2, Inf)))
myTdm1 <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))

mat2 <- as.data.frame(as.matrix(myTdm2))
mat1 <- as.data.frame(as.matrix(myTdm1))

write.table(mat2, file = "_lyrics_70s_2.txt",
            col.names = FALSE, row.names = TRUE, sep = "\t")
write.table(mat1, file = "_lyrics_70s_1.txt",
            col.names = FALSE, row.names = TRUE, sep = "\t")

# Draw a word cloud of the term frequencies in `tdm` on the current graphics
# device and copy it to a PNG file.
#
# tdm       term-document matrix; row sums are used as term frequencies
# png_file  output PNG path (8 x 6 in at 200 dpi)
# palette   colour vector handed to wordcloud()
#
# Returns (invisibly) the data frame of words and frequencies that was plotted.
plot_term_cloud <- function(tdm, png_file, palette) {
  # Calculate the frequency of words, most frequent first.
  term_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  cloud_data <- data.frame(word = names(term_freq), freq = term_freq)

  # Alternative settings kept from the original script:
  # wordcloud(d$word, d$freq, scale=c(4,0.5), min.freq=3, random.order=F,
  #           rot.per=.1, family="malgun")
  wordcloud(cloud_data$word, cloud_data$freq, scale = c(7, 0.8), min.freq = 5,
            random.order = FALSE, rot.per = .1, colors = palette,
            family = "malgun")

  # dev.copy() opens the PNG device and makes it current; dev.off() then
  # closes it, which writes the file.
  dev.copy(png, png_file, width = 8, height = 6, units = "in", res = 200)
  dev.off()

  invisible(cloud_data)
}

# Font setup. Note that the font name is sensitive to spacing and letter case.
# Malgun Gothic: windowsFonts(malgun=windowsFont("¸¼Àº °íµñ"))
# Nanum Gothic : windowsFonts(malgun=windowsFont("³ª´®°íµñ"))
windowsFonts(malgun = windowsFont("¸¼Àº °íµñ"))

pal <- brewer.pal(8, "Dark2")

# Cloud 1: all term lengths.
myTdm <- myTdm1
d <- plot_term_cloud(myTdm, "1970s-1.png", pal)

# Cloud 2: terms of length >= 2 only.
myTdm <- myTdm2
d <- plot_term_cloud(myTdm, "1970s-2.png", pal)