a quick tidyTuesday on Passwords
First, I wanted to acquire the distribution of letters and then play with that. I embedded the result here. The second step is to import the tidyTuesday data.
library(tidyverse)
Letter.Freq <- data.frame(stringsAsFactors=FALSE,
Letter = c("E", "T", "A", "O", "I", "N", "S", "R", "H", "D", "L", "U",
"C", "M", "F", "Y", "W", "G", "P", "B", "V",
"K", "X", "Q", "J", "Z"),
Frequency = c(12.02, 9.1, 8.12, 7.68, 7.31, 6.95, 6.28, 6.02, 5.92, 4.32,
3.98, 2.88, 2.71, 2.61, 2.3, 2.11, 2.09, 2.03,
1.82, 1.49, 1.11, 0.69, 0.17, 0.11, 0.1, 0.07)
)
Letter.Freq <- Letter.Freq %>% mutate(Frequency = Frequency / 100, Letter = tolower(Letter))
passwords <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-14/passwords.csv')
Let me visualize the distribution of letters.
library(ggthemes)
ggplot(Letter.Freq) + aes(x=reorder(Letter, desc(Letter)), y=Frequency) + geom_col() + coord_flip() + theme_economist() + labs(x="")
I want to use the tm
library for topic models to extract out the numbers and only work with letters. There is something probably worthwhile to do with Zipf’s law or the like on that.
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Remove numbers
passwords <- passwords %>% mutate(PasswordNN = removeNumbers(password))
# Split each string into the letters; here I am repurposing a word count function
freq <- function(x,y) {
word <- tolower(unlist(strsplit(x,"")))
word_table <- table(word)
ans <- word_table[names(word_table)==y]
return(ans)
}
# Use sapply to apply the function. This is not tidy but I have never sat down with map
Letter.FreqPW <- sapply(letters,function(x) { freq(passwords$PasswordNN, x) } )
# Rename the resulting vector with proper labels
names(Letter.FreqPW) <- letters
# Clean up the names and metrics
LFPW <- data.frame(Letter.FreqPW)
LFPW$Letter <- row.names(LFPW)
LFPW <- LFPW %>% mutate(Freq = Letter.FreqPW)
# Join a result
Res <- left_join(Letter.Freq, LFPW)
## Joining, by = "Letter"
Res
## Letter Frequency Letter.FreqPW Freq
## 1 e 0.1202 327 327
## 2 t 0.0910 144 144
## 3 a 0.0812 271 271
## 4 o 0.0768 191 191
## 5 i 0.0731 178 178
## 6 n 0.0695 197 197
## 7 s 0.0628 176 176
## 8 r 0.0602 233 233
## 9 h 0.0592 82 82
## 10 d 0.0432 96 96
## 11 l 0.0398 144 144
## 12 u 0.0288 59 59
## 13 c 0.0271 115 115
## 14 m 0.0261 86 86
## 15 f 0.0230 38 38
## 16 y 0.0211 64 64
## 17 w 0.0209 42 42
## 18 g 0.0203 75 75
## 19 p 0.0182 71 71
## 20 b 0.0149 77 77
## 21 v 0.0111 37 37
## 22 k 0.0069 50 50
## 23 x 0.0017 36 36
## 24 q 0.0011 5 5
## 25 j 0.0010 26 26
## 26 z 0.0007 12 12
Turn it to a proportion.
# Create a proportion
Res <- Res %>% mutate(LF = Freq / sum(Freq))
Let me plot the result.
library(ggrepel)
gg1 <- ggplot(Res) + aes(x=Frequency, y=LF, label=Letter) + geom_label_repel(fill="white") + labs(x="Language Frequency", y="Frequency in Password", title="Password Letters vs. English Letter Frequency") + coord_equal() + geom_abline(slope=1, intercept=0) + ggthemes::theme_economist()
gg1