January 18, 2020

a quick tidyTuesday on Passwords

First, I wanted to acquire the distribution of letters and then play with that. I embedded the result here. The second step is to import the tidyTuesday data.

library(tidyverse)
Letter.Freq <- data.frame(stringsAsFactors=FALSE,
                     Letter = c("E", "T", "A", "O", "I", "N", "S", "R", "H", "D", "L", "U",
                                "C", "M", "F", "Y", "W", "G", "P", "B", "V",
                                "K", "X", "Q", "J", "Z"),
                  Frequency = c(12.02, 9.1, 8.12, 7.68, 7.31, 6.95, 6.28, 6.02, 5.92, 4.32,
                                3.98, 2.88, 2.71, 2.61, 2.3, 2.11, 2.09, 2.03,
                                1.82, 1.49, 1.11, 0.69, 0.17, 0.11, 0.1, 0.07)
               )
Letter.Freq <- Letter.Freq %>% mutate(Frequency = Frequency / 100, Letter = tolower(Letter))
passwords <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-14/passwords.csv')

Let me visualize the distribution of letters.

library(ggthemes)
ggplot(Letter.Freq) + aes(x=reorder(Letter, desc(Letter)), y=Frequency) + geom_col() + coord_flip() + theme_economist() + labs(x="")

I want to use the tm library for topic models to extract out the numbers and only work with letters. There is something probably worthwhile to do with Zipf’s law or the like on that.

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Remove numbers
passwords <- passwords %>% mutate(PasswordNN = removeNumbers(password))
# Split each string into the letters; here I am repurposing a word count function
freq <- function(x,y) {
     word <- tolower(unlist(strsplit(x,"")))
     word_table <- table(word)
     ans <- word_table[names(word_table)==y]
     return(ans)
}
# Use sapply to apply the function.  This is not tidy but I have never sat down with map
Letter.FreqPW <- sapply(letters,function(x) { freq(passwords$PasswordNN, x) } )
# Rename the resulting vector with proper labels
names(Letter.FreqPW) <- letters
# Clean up the names and metrics
LFPW <- data.frame(Letter.FreqPW)
LFPW$Letter <- row.names(LFPW)
LFPW <- LFPW %>% mutate(Freq = Letter.FreqPW)
# Join a result
Res <- left_join(Letter.Freq, LFPW)
## Joining, by = "Letter"
Res
##    Letter Frequency Letter.FreqPW Freq
## 1       e    0.1202           327  327
## 2       t    0.0910           144  144
## 3       a    0.0812           271  271
## 4       o    0.0768           191  191
## 5       i    0.0731           178  178
## 6       n    0.0695           197  197
## 7       s    0.0628           176  176
## 8       r    0.0602           233  233
## 9       h    0.0592            82   82
## 10      d    0.0432            96   96
## 11      l    0.0398           144  144
## 12      u    0.0288            59   59
## 13      c    0.0271           115  115
## 14      m    0.0261            86   86
## 15      f    0.0230            38   38
## 16      y    0.0211            64   64
## 17      w    0.0209            42   42
## 18      g    0.0203            75   75
## 19      p    0.0182            71   71
## 20      b    0.0149            77   77
## 21      v    0.0111            37   37
## 22      k    0.0069            50   50
## 23      x    0.0017            36   36
## 24      q    0.0011             5    5
## 25      j    0.0010            26   26
## 26      z    0.0007            12   12

Turn it to a proportion.

# Create a proportion
Res <- Res %>% mutate(LF = Freq / sum(Freq))

Let me plot the result.

library(ggrepel)
gg1 <- ggplot(Res) + aes(x=Frequency, y=LF, label=Letter) + geom_label_repel(fill="white") + labs(x="Language Frequency", y="Frequency in Password", title="Password Letters vs. English Letter Frequency") + coord_equal() + geom_abline(slope=1, intercept=0) + ggthemes::theme_economist() 
gg1