# Last of the 3 programs in R. It was intended to calculate the standard deviation of how often numbers are mentioned in various texts, after I discovered that I need elevation to use MSI Afterburner to collect FPS measurements. All the extra code needed to generate number words for a particular language (let alone an inflected one) is the reason why I flat-out refuse to rewrite any numeral, even in the most formal papers, as individual words as if writing a check.
# Spellings of 1..12, stored so that the index equals the value.
numberwordsbase <- c(
  "one", "two", "three", "four", "five", "six",
  "seven", "eight", "nine", "ten", "eleven", "twelve"
)
# Building blocks for larger numbers: index 1 is the "teen" suffix, indices
# 2..9 are the tens "twenty".."ninety", then "hundred" and "thousand".
numberwordsadd <- c(
  "teen", "twenty", "thirty", "forty", "fifty", "sixty",
  "seventy", "eighty", "ninety", "hundred", "thousand"
)
# The source material contains no "zero", so numberwords[n] is simply the
# spelling of n and R's 1-based indexing lines up with the values.
# The count for "one" should not be relied upon: its usage is quite
# complicated.
teens <- c(
  "thirteen", "fourteen", "fifteen", "sixteen",
  "seventeen", "eighteen", "nineteen"
)
numberwords <- c(numberwordsbase, teens)
# Each decade contributes its bare tens word (20, 30, ...) followed by the
# nine hyphenated compounds (21..29, 31..39, ...).
for (tens in numberwordsadd[2:9]) {
  numberwords <- c(
    numberwords,
    tens,
    paste(tens, numberwordsbase[1:9], sep = "-")
  )
}
# Entry 100 closes the table.
numberwords <- c(numberwords, numberwordsadd[10])
# Numbers over 100 are not counted either; there are only a handful of them.
#while(i < 1000){
# if(i%%100 == 0){
# numberwords = c(numberwords, paste(numberwordsbase[i%/%100], numberwordsadd[10]))
# }else if(i%%10 == 0){
# numberwords = c(numberwords, paste(numberwordsbase[i%/%100], numberwordsadd[10], numberwordsadd[i%%100%/%10]))
# }else{
# numberwords = c(numberwords, paste(numberwordsbase[i%/%100], " ", numberwordsadd[10], " ", numberwordsadd[i%/%10%%100], "-", numberwordsbase[i%%10], sep = ""))
# }
# i = i + 1
#}
# One counter per entry of numberwords: freqs[n] will hold the number of
# occurrences of the word spelling n.
freqs <- rep(0, length(numberwords))
longtext <- readLines("..\\Downloads\\135-0-les-miserables.txt") # modify the path
# Split every line on anything that is not an ASCII letter, a digit or a
# hyphen. Hyphens are kept so that compounds such as "twenty-one" survive as
# a single token. The hyphen sits last in the bracket expression, where it is
# literal; writing it as a backslash-escaped hyphen is not possible because
# that is an unrecognized escape in an R string and the script fails to parse.
# strsplit() is vectorised over lines and returns a list with one character
# vector per line, which unlist() flattens into a single vector of tokens.
pieces <- unlist(strsplit(longtext, "[^0-9A-Za-z-]"), use.names = FALSE)
# Adjacent separators (e.g. ", ") yield empty strings; drop them.
words <- pieces[pieces != ""]
cat(length(longtext), "lines split\n")
# Tally how often each number word occurs in the text.
# match() maps every token to its position in numberwords (NA for tokens that
# are not number words); tabulate() then counts how many times each position
# was hit. Matching is exact and case-sensitive, so a capitalised "Twenty"
# is not counted.
hits <- match(words, numberwords)
freqs <- freqs + tabulate(hits[!is.na(hits)], nbins = length(freqs))
# Report the raw counts, followed by summary statistics taken over all
# entries of numberwords (words that never occur count as 0).
print(freqs)
average <- sum(freqs) / length(numberwords)
cat("Average:", average, "\n")
cat("Deviations: ", freqs - average, "\n")
# Sample standard deviation: squared deviations divided by n - 1.
stddev <- sqrt(sum((freqs - average)^2) / (length(numberwords) - 1))
cat("Standard deviation:", stddev, "\n")
# Selected quantiles of the counts, printed so the computed values are
# actually reported alongside the other statistics.
quantiles <- quantile(freqs, c(0, 0.001, 0.01, 0.1, 0.5, 1))
print(quantiles)
# Horizontal bar chart, one bar per number word. TRUE/FALSE are spelled out
# because the shorthands T and F are ordinary variables that can be reassigned.
barplot(freqs, horiz = TRUE, beside = FALSE)
# No comments:
# Post a Comment
# Barely anyone comments, so I don't moderate. Free advertising, I guess.