Main issues for today
library(tidyverse)
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.5
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(babynames)
Kelly visualisation:
# Kelly list: rank-frequency line plot on natural-log axes.
# Rows are sorted by descending raw frequency; rows with a missing raw
# frequency, or with WPM equal to 1000000, are dropped before ranking.
read_tsv("./Swedish-Kelly_M3_CEFR.tsv") %>%
  arrange(desc(`Raw freq`)) %>%
  filter(!is.na(`Raw freq`)) %>%
  filter(WPM != 1000000) %>%
  # row_number() numbers the remaining rows 1..n and is safe when no rows
  # are left, whereas 1:length(ID) would evaluate to c(1, 0).
  mutate(Rank = row_number()) %>%
  ggplot(aes(x = log(Rank), y = log(`Raw freq`))) +
  geom_line()
Parsed with column specification:
cols(
ID = col_integer(),
`Raw freq` = col_integer(),
WPM = col_double(),
`CEFR levels` = col_character(),
Source = col_character(),
Grammar = col_character(),
`Swedish items for translation` = col_character(),
`Word classes` = col_character(),
Examples = col_character()
)
Another version of the same thing, written in a less “idiomatic” tidyverse style (the old-fashioned way):
# Load the Kelly list from its tab-separated file into `kelly`.
kelly <- read_tsv(file = "Swedish-Kelly_M3_CEFR.tsv")
Parsed with column specification:
cols(
ID = col_integer(),
`Raw freq` = col_integer(),
WPM = col_double(),
`CEFR levels` = col_character(),
Source = col_character(),
Grammar = col_character(),
`Swedish items for translation` = col_character(),
`Word classes` = col_character(),
Examples = col_character()
)
# Keep only rows that have a raw frequency and whose WPM is not 1000000.
kelly <- filter(kelly, !(is.na(`Raw freq`) | WPM == 1000000))
# Rank the rows in their current order (no sorting is applied here).
# seq_len() also copes with an empty table, where 1:nrow(kelly) would
# evaluate to c(1, 0).
kelly$rank <- seq_len(nrow(kelly))
# Raw frequency against rank on linear axes.
kelly %>%
  ggplot(aes(rank, `Raw freq`)) +
  geom_line()
Now plot this again, taking the log values of rank and frequency:
# Rank-frequency line plot on log10 axes.
# Rank follows the row order of the file: no sorting is applied in this
# version.
read_tsv("Swedish-Kelly_M3_CEFR.tsv") %>%
  filter(!(is.na(`Raw freq`) | WPM == 1000000)) %>%
  # row_number() is safe for an empty table, unlike 1:nrow(.), which
  # would evaluate to c(1, 0).
  mutate(rank = row_number()) %>%
  ggplot(aes(log10(rank), log10(`Raw freq`))) +
  geom_line()
Parsed with column specification:
cols(
ID = col_integer(),
`Raw freq` = col_integer(),
WPM = col_double(),
`CEFR levels` = col_character(),
Source = col_character(),
Grammar = col_character(),
`Swedish items for translation` = col_character(),
`Word classes` = col_character(),
Examples = col_character()
)
What’s that “blip”? How can we fix it?
# Order the table from highest to lowest Raw freq
arrange(kelly, desc(`Raw freq`))
And then redo everything with the fix incorporated
# Rank-frequency line plot on log10 axes, with the rows sorted by
# descending raw frequency before ranks are assigned.
read_tsv("Swedish-Kelly_M3_CEFR.tsv") %>%
  arrange(desc(`Raw freq`)) %>%
  filter(!(is.na(`Raw freq`) | WPM == 1000000)) %>%
  # row_number() is safe for an empty table, unlike 1:nrow(.), which
  # would evaluate to c(1, 0).
  mutate(rank = row_number()) %>%
  ggplot(aes(log10(rank), log10(`Raw freq`))) +
  geom_line()
Parsed with column specification:
cols(
ID = col_integer(),
`Raw freq` = col_integer(),
WPM = col_double(),
`CEFR levels` = col_character(),
Source = col_character(),
Grammar = col_character(),
`Swedish items for translation` = col_character(),
`Word classes` = col_character(),
Examples = col_character()
)
Wide data: