Revision

Humans tend to prefer “wide format” data for reading/writing

Wide versus long
Wide versus long

The key to the ggplot (Grammar of Graphics) approach is “long format”. In the long format it is easy to map dimensions of your data (the various things you know about an observation) to dimensions of your plot.

Mapping dimensions

The gather function converts wide data to long data. Do this when several different columns indicate something that should be graphed on a single dimension.

# Toy wide-format table: one row per speaker, one score column per question.
data <- tibble(
  speaker = c("Speaker 1", "Speaker 2", "Speaker 3"),
  Q1 = c(1, 3, 1),
  Q2 = c(2, 2, 2),
  Q3 = c(1, 2, 2)
)

# Print the wide table.
data

Gather:

# Template:
# data %>% gather(key = "key", value = "value", c("The", "Columns", "To", "Gather"))

# Long format: the Q1-Q3 columns become question/score pairs.
data %>%
  gather(key = "question", value = "score", c("Q1", "Q2", "Q3"))

# The same reshaping, piped straight into a jittered scatter plot.
data %>%
  gather(key = "question", value = "score", c("Q1", "Q2", "Q3")) %>%
  ggplot(mapping = aes(x = question, y = score, colour = speaker)) +
  geom_jitter(width = 0.1, height = 0)

(note: geom_jitter is like geom_point except that it adds a bit of random variation to each value; useful to avoid overplotting)

Use the spread function when you need to have two observations in one row, for instance if you need to make a compound score:

# Sum the counts in `n` within each year/sex combination.
babynames %>%
  group_by(year, sex) %>%
  summarise(total.n = sum(n))

What if we wanted proportion female?

# Proportion of female babies per year.
babynames %>%
  group_by(year, sex) %>%
  summarise(total.n = sum(n)) %>%
  # Drop the grouping that summarise() leaves behind before reshaping.
  ungroup() %>%
  # spread() puts the totals of both sexes for a year side by side in one row ...
  spread(sex, total.n) %>%
  # ... so the compound score can be computed from two columns of that row.
  # Inside mutate(), `F` refers to the column made by spread(), not to FALSE.
  # NOTE(review): assumes `sex` is coded "F"/"M" - confirm against the data.
  mutate(prop.female = F / (F + M))
Spread
Spread

GREP

Grep review

  1. Are there any four letter names starting “Eri” apart from “Erik” and “Eric”?
  2. How many spelling variants of Mary/Maria can you find in a single search? How about Christine/Kristen etc.?
  3. Can you think of a way (using what we’ve learned) to get the second character of a string?

Question 1

Are there any four letter names starting “Eri” apart from “Erik” and “Eric”?

# Four-letter names that start with "Eri" and do not end in "c" or "k".
# The leading ^ ties "Eri" to the start of the name; without it the pattern
# only constrains the last four characters of the name.
babynames %>%
  pull(name) %>%
  unique() %>%
  str_subset("^Eri[^ck]$")
# Alternative that keeps the full rows instead of just the names:
# babynames %>% filter(str_detect(name, "^Eri[^ck]$"))

Question 2

How many spelling variants of Mary/Maria can you find in a single search?

# Every distinct name in the data set.
all_names <- unique(pull(babynames, name))

# "Ma", one or more "r", then y / ie / ye / i at the end of the name.
str_subset(all_names, "Mar+(y|ie|ye|i)$")

# Looser: an optional vowel or "h" after "Ma", one or more "r",
# then one or two of a / i / e / y at the end of the name.
str_subset(all_names, "Ma[aeiouh]?r+[aiey]{1,2}$")

How about the same for Christine etc.? For example: with K at the start instead of Ch, with C but no h, or with a at the end.

# C or K, an optional "h", "ristin", then a or e at the end of the name.
str_subset(all_names, "[CK]h?ristin[ae]$")

# Names containing "Jean" followed by pierre, claude, michel or michael.
str_subset(all_names, "Jean(pierre|claude|micha?el)")

Question 3

Can you think of a way (using what we’ve learned) to get the second character of a string?

# Non-regex way: take the substring that starts and ends at position 2.
str_sub("Eric", start = 2, end = 2)
[1] "r"
# Add the second character of every name as a new column.
# str_sub() is used because a pattern anchored with $ (such as "[^A-Z]$")
# would pick out the final character of the name instead of the second one.
babynames %>%
  mutate(second_letter = str_sub(name, 2, 2))

Facets

Multiple repeat plots — a way of adding one more dimension

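There are two facet functions:

  - facet_wrap, when you want to repeat a plot according to a single dimension
  - facet_grid, when you want to repeat a plot according to two dimensions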

# Yearly totals by sex for names ending in a vowel letter, one panel per vowel.
babynames %>%
  mutate(final_vowel = str_extract(name, "[aeiouy]$")) %>%
  # str_extract() gives NA when the name does not end in a, e, i, o, u or y.
  filter(!is.na(final_vowel)) %>%
  group_by(year, sex, final_vowel) %>%
  summarise(total = sum(n)) %>%
  ggplot(mapping = aes(x = year, y = log10(total))) +
  geom_line(mapping = aes(colour = sex)) +
  facet_wrap(~ final_vowel)

More-or-less the same thing again, but using str_sub instead of a regular expression:

# Yearly totals by sex for every final letter, one panel per letter.
babynames %>%
  # Negative positions count from the end, so -1 to -1 is the last character.
  mutate(final_letter = str_sub(name, start = -1, end = -1)) %>%
  group_by(year, sex, final_letter) %>%
  summarise(total = sum(n)) %>%
  ggplot(mapping = aes(x = year, y = log10(total), colour = sex)) +
  geom_line() +
  facet_wrap(~ final_letter)

If-then-else and facet_grid example

# Classify a consonant letter by manner of articulation (plus voicing for
# stops and fricatives).
#
# C: a single upper-case letter, as a character string or length-one factor.
# Returns one of "stop, voiced", "stop, voiceless", "nasal",
# "fricative, voiceless" or "fricative, voiced"; NA_character_ for any
# other input.
get_manner <- function(C) {
  if (C %in% c("B", "D", "G")) {
    "stop, voiced"
  } else if (C %in% c("P", "T", "K")) {
    "stop, voiceless"
  } else if (C %in% c("M", "N")) {
    "nasal"
  } else if (C %in% c("F", "S", "H")) {
    "fricative, voiceless"
  } else if (C %in% c("V", "Z")) {
    "fricative, voiced"
  } else {
    # A typed NA keeps the return value character in every branch.
    NA_character_
  }
}
# Classify a consonant letter by (coarse) place of articulation.
#
# C: a single upper-case letter, as a character string or length-one factor.
# Returns "bilabial", "alveolar" or "velar etc."; NA_character_ for any
# other input. The grouping is deliberately coarse: F and V are counted
# with the bilabials, and H with the velars.
get_place <- function(C) {
  if (C %in% c("B", "P", "F", "V", "M")) {
    "bilabial"
  } else if (C %in% c("D", "T", "S", "Z", "N")) {
    "alveolar"
  } else if (C %in% c("G", "K", "H")) {
    "velar etc."
  } else {
    # A typed NA keeps the return value character in every branch.
    NA_character_
  }
}
# Popularity over time of names starting with selected consonants, laid out
# as a manner (rows) by place (columns) grid of panels.
babynames %>%
  mutate(initial_C = factor(str_extract(name, "^[BDGPTKMNVFSVZH]"))) %>%
  # Names that start with any other letter get NA from str_extract().
  filter(!is.na(initial_C)) %>%
  group_by(year, sex, initial_C) %>%
  summarise(total = sum(n)) %>%
  # Drop the leftover grouping so the classification below and the label
  # layer work on the plain table.
  ungroup() %>%
  mutate(
    manner = factor(
      map_chr(initial_C, get_manner),
      levels = c(
        "stop, voiced", "stop, voiceless",
        "fricative, voiced", "fricative, voiceless", "nasal"
      )
    ),
    place = factor(
      map_chr(initial_C, get_place),
      levels = c("bilabial", "alveolar", "velar etc.")
    )
  ) %>%
  ggplot(aes(x = year, y = total, colour = sex)) +
  geom_line() +
  facet_grid(manner ~ place) +
  # Label each panel with its consonant. The layer gets one row per
  # consonant; drawing the label from the full data would print the same
  # letter once per year and sex on top of itself.
  geom_text(
    data = function(d) distinct(d, manner, place, initial_C),
    mapping = aes(label = initial_C),
    x = 1900, y = 200000, colour = "black",
    inherit.aes = FALSE
  ) +
  labs(title = "Popularity of selected initial consonants in given names")

Mapping functions

Produce a vector from a function

Take the names and capitalise all the vowel-final ones:

# Upper-case a word that ends in a, e, i, o, u or y; lower-case any other word.
#
# word: a single character string.
# Returns the word converted entirely to upper or to lower case.
vfinal2cap <- function(word) {
  ends_in_vowel <- str_detect(word, "[aeiouy]$")
  if (ends_in_vowel) str_to_upper(word) else str_to_lower(word)
}
# Try the function on a vowel-final and a consonant-final name.
vfinal2cap("Emma")
vfinal2cap("Bob")

# Apply it to every distinct name and show the first 24 results.
all_names <- unique(pull(babynames, name))
head(map_chr(all_names, vfinal2cap), 24)
LS0tCnRpdGxlOiAnTGVjdHVyZSA2OiBEYXRhIGV4cGxvcmF0aW9uJwphdXRob3I6ICJNaWNoYWVsIER1bm4sIERlcHQuIG9mIExpbmd1aXN0aWNzIGFuZCBQaGlsb2xvZ3ksIFVwcHNhbGEgVW5pdmVyc2l0eSIKZGF0ZTogIkxlY3R1cmUgNiwgMjAxOC0wNS0wMiIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICBkZl9wcmludDogcGFnZWQKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoYmFieW5hbWVzKQpsaWJyYXJ5KHN0cmluZ3IpCmBgYAoKIyMgUmV2aXNpb24KCkh1bWFucyB0ZW5kIHRvIHByZWZlciAid2lkZSBmb3JtYXQiIGRhdGEgZm9yIHJlYWRpbmcvd3JpdGluZwoKIVsqV2lkZSB2ZXJzdXMgbG9uZypdKC4vd2lkZS12cy1sb25nLnBuZykKClRoZSBrZXkgdG8gdGhlIGdncGxvdCAoR3JhbW1hciBvZiBHcmFwaGljcykgYXBwcm9hY2ggaXMgImxvbmcgZm9ybWF0Ii4gSW4gdGhlIGxvbmcgZm9ybWF0IGl0IGlzIGVhc3kgdG8gbWFwIGRpbWVuc2lvbnMgb2YgeW91ciBkYXRhICh0aGUgdmFyaW91cyB0aGluZ3MgeW91IGtub3cgYWJvdXQgYW4gb2JzZXJ2YXRpb24pIHRvIGRpbWVuc2lvbnMgb2YgeW91ciBwbG90LgoKIVsqTWFwcGluZyBkaW1lbnNpb25zKl0oLi9hZXN0aGV0aWMucG5nKQpUaGUgYGdhdGhlcmAgZnVuY3Rpb24gY29udmVydHMgd2lkZSBkYXRhIHRvIGxvbmcgZGF0YS4gRG8gdGhpcyB3aGVuIHNldmVyYWwgZGlmZmVyZW50IGNvbHVtbnMgYXJlIGluZGljYXRpbmcgc29tZXRoaW5nIHdoaWNoIHNob3VsZCBiZSBncmFwaGVkIG9uIGEgc2luZ2xlIGRpbWVuc2lvbi4KCmBgYHtyfQpkYXRhIDwtIHRpYmJsZSgKICBzcGVha2VyPWMoIlNwZWFrZXIgMSIsICJTcGVha2VyIDIiLCAiU3BlYWtlciAzIiksIAogIFExPWMoMSwzLDEpLCAKICBRMj1jKDIsMiwyKSwgCiAgUTM9YygxLDIsMikpCmRhdGEKYGBgCgpHYXRoZXI6IAohW10oLi9nYXRoZXIucG5nKQoKYGBge3J9CiMgZGF0YSAlPiUgZ2F0aGVyKGtleSA9ICJrZXkiLCB2YWx1ZSA9ICJ2YWx1ZSIsYygiVGhlIiwgIkNvbHVtbnMiLCAiVG8iLCAiR2F0aGVyIikpCmRhdGEgJT4lIGdhdGhlcigicXVlc3Rpb24iLCAic2NvcmUiLCBjKCJRMSIsICJRMiIsICJRMyIpKQpgYGAKYGBge3J9CmRhdGEgJT4lIAogIGdhdGhlcigicXVlc3Rpb24iLCAic2NvcmUiLCBjKCJRMSIsICJRMiIsICJRMyIpKSAlPiUgCiAgZ2dwbG90KGFlcyh4PXF1ZXN0aW9uLCB5PXNjb3JlLCBjb2xvdXI9c3BlYWtlcikpICsgZ2VvbV9qaXR0ZXIoaGVpZ2h0PTAsIHdpZHRoPTAuMSkgCmBgYAoobm90ZTogYGdlb21faml0dGVyYCBpcyBsaWtlIGBnZW9tX3BvaW50YCBleGNlcHQgdGhhdCBpdCBhZGRzIGEgYml0IG9mIHJhbmRvbSB2YXJpYXRpb24gdG8gZWFjaCB2YWx1ZTsgdXNlZnVsIHRvIGF2b2lkIG92ZXJwbG90dGluZykKClVzZSB0aGUgYHNwcmVhZGAgZnVuY3Rpb24gd2hlbiB5b3UgKm5lZWQqIHRvIGhhdmUgdHdvIG9ic2VydmF0
aW9ucyBpbiBvbmUgcm93LCBmb3IgaW5zdGFuY2UgaWYgeW91IG5lZWQgdG8gbWFrZSBhIGNvbXBvdW50IHNjb3JlOgoKYGBge3J9CiMgVGhlIG51bWJlciBvZiBtYWxlIGFuZCBmZW1hbGUgYmFiaWVzIGVhY2ggeWVhcgpiYWJ5bmFtZXMgJT4lIGdyb3VwX2J5KHllYXIsIHNleCkgJT4lIHN1bW1hcmlzZSh0b3RhbC5uPXN1bShuKSkKYGBgCldoYXQgaWYgd2Ugd2FudGVkICpwcm9wb3J0aW9uKiBmZW1hbGU/IAoKYGBge3J9CmJhYnluYW1lcyAlPiUgCiAgZ3JvdXBfYnkoeWVhciwgc2V4KSAlPiUgCiAgc3VtbWFyaXNlKHRvdGFsLm49c3VtKG4pKSAlPiUKICBzcHJlYWQoc2V4LCB0b3RhbC5uKQpgYGAKCiFbKlNwcmVhZCpdKC4vc3ByZWFkLnBuZykKCiMjIEdSRVAKCi0gU2VlIHNsaWRlc2hvdwoKR3JlcCByZXZpZXcKCjEuIEFyZSB0aGVyZSBhbnkgZm91ciBsZXR0ZXIgbmFtZXMgc3RhcnRpbmcgIkVyaSIgYXBhcnQgZnJvbSAiRXJpayIgYW5kICJFcmljIj8KMi4gSG93IG1hbnkgc3BlbGxpbmcgdmFyaWFudHMgb2YgTWFyeS9NYXJpYSBjYW4geW91IGZpbmQgaW4gYSBzaW5nbGUgc2VhcmNoPyBIb3cgYWJvdXQgQ2hyaXN0aW5lL0tyaXN0ZW4gZXRjLiApCjMuIENhbiB5b3UgdGhpbmsgb2YgYSB3YXkgKHVzaW5nIHdoYXQgd2UndmUgbGVhcm5lZCkgdG8gZ2V0IHRoZSAqc2Vjb25kKiBjaGFyYWN0ZXIgb2YgYSBzdHJpbmc/CgojIyBRdWVzdGlvbiAxCgpBcmUgdGhlcmUgYW55IGZvdXIgbGV0dGVyIG5hbWVzIHN0YXJ0aW5nICJFcmkiIGFwYXJ0IGZyb20gIkVyaWsiIGFuZCAiRXJpYyI/CgpgYGB7cn0KYmFieW5hbWVzICU+JSBwdWxsKG5hbWUpICU+JSB1bmlxdWUoKSAlPiUgc3RyX3N1YnNldCgiRXJpW15ja10kIikKIyBiYWJ5bmFtZXMgJT4lIGZpbHRlcihzdHJfZGV0ZWN0KG5hbWUsICJFcmlbXmNrXSQiKSkKYGBgCgojIyBRdWVzdGlvbiAyCgpIb3cgbWFueSBzcGVsbGluZyB2YXJpYW50cyBvZiBNYXJ5L01hcmlhIGNhbiB5b3UgZmluZCBpbiBhIHNpbmdsZSBzZWFyY2g/CgpgYGB7cn0KYWxsX25hbWVzIDwtIGJhYnluYW1lcyAlPiUgcHVsbChuYW1lKSAlPiUgdW5pcXVlKCkgCmFsbF9uYW1lcyAlPiUgc3RyX3N1YnNldCgiTWFyKyh5fGllfHllfGkpJCIpCmBgYAoKYGBge3J9CmFsbF9uYW1lcyAlPiUgc3RyX3N1YnNldCgiTWFbYWVpb3VoXT9yK1thaWV5XXsxLDJ9JCIpCmBgYAoKaG93IGFib3V0IHRoZSBzYW1lIGZvciBDaHJpc3RpbmUgZXRjLgoKLSB3aXRoIEsgYXQgdGhlIHN0YXJ0IGluc3RlYWQgb2YgQ2gsIAotIEMgd2l0aG91dCBoLCAKLSB3aXRoIGEgYXQgdGhlIGVuZAoKYGBge3J9CmFsbF9uYW1lcyAlPiUgc3RyX3N1YnNldCgiW0NLXWg/cmlzdGluW2FlXSQiKQpgYGAKCmBgYHtyfQphbGxfbmFtZXMgJT4lIHN0cl9zdWJzZXQoIkplYW4ocGllcnJlfGNsYXVkZXxtaWNoYT9lbCkiKQpgYGAKCiMjIFF1ZXN0aW9uIDMKCkNhbiB5b3UgdGhpbmsgb2YgYSB3YXkgKHVzaW5nIHdoYXQgd2UndmUgbGVhcm5lZCkgdG8gZ2V0
IHRoZSAqc2Vjb25kKiBjaGFyYWN0ZXIgb2YgYSBzdHJpbmc/Ci0tLS0KCi0gVGhlIG5vbi1yZWdleCB3YXkgaXMgYmVzdCBoZXJlLApgYGB7cn0KCnN0cl9zdWIoIkVyaWMiLCAyLCAyKQpgYGAKCgpgYGB7cn0KYmFieW5hbWVzICU+JSBtdXRhdGUoc2Vjb25kX2xldHRlcj1zdHJfZXh0cmFjdChuYW1lLCAiW15BLVpdJCIpKQpgYGAKCiMjIEZhY2V0cwoKTXVsdGlwbGUgcmVwZWF0IHBsb3RzIOKAlCBhIHdheSBvZiBhZGRpbmcgb25lIG1vcmUgZGltZW5zaW9uCgpUaGVyZSBhcmUgdHdvIGZhY2V0IGZ1bmN0aW9uczogCgotIGBmYWNldF93cmFwYCB3aGVuIHlvdSB3YW50IHRvIHJlcGVhdCBhIHBsb3QgYWNjb3JkaW5nIGEgc2luZ2xlIGRpbWVuc2lvbgotIGBmYWNldF9ncmlkYCB3aGVuIHlvdSB3YW50IHRvIHJlcGVhdCBhIHBsb3QgYWNjb3JkaW5nIHRvIHR3byBkaW1lbnNpb25zCgpgYGB7cn0KYmFieW5hbWVzICU+JSAKICBtdXRhdGUoZmluYWxfdm93ZWw9c3RyX2V4dHJhY3QobmFtZSwgIlthZWlvdXldJCIpKSAlPiUKICBmaWx0ZXIoIWlzLm5hKGZpbmFsX3Zvd2VsKSkgJT4lIAogIGdyb3VwX2J5KHllYXIsIHNleCwgZmluYWxfdm93ZWwpICU+JSAKICBzdW1tYXJpc2UodG90YWw9c3VtKG4pKSAlPiUgCiAgZ2dwbG90KGFlcyh4PXllYXIsIHk9bG9nMTAodG90YWwpKSkgKyBnZW9tX2xpbmUoYWVzKGNvbG91cj1zZXgpKSArIGZhY2V0X3dyYXAofiBmaW5hbF92b3dlbCkKYGBgCgpNb3JlLW9yLWxlc3MgdGhlIHNhbWUgdGhpbmcgYWdhaW4sIGJ1dCB1c2luZyBzdHJfc3ViIGluc3RlYWQgb2YgYSByZWd1bGFyIGV4cHJlc3Npb246CgpgYGB7cn0KYmFieW5hbWVzICU+JSAKICBtdXRhdGUoZmluYWxfbGV0dGVyPXN0cl9zdWIobmFtZSwgLTEsIC0xKSkgJT4lCiAgZ3JvdXBfYnkoeWVhciwgc2V4LCBmaW5hbF9sZXR0ZXIpICU+JSAKICBzdW1tYXJpc2UodG90YWw9c3VtKG4pKSAlPiUgCiAgZ2dwbG90KGFlcyh4PXllYXIsIHk9bG9nMTAodG90YWwpLCBjb2xvdXI9c2V4KSkgKyBnZW9tX2xpbmUoKSArIGZhY2V0X3dyYXAofiBmaW5hbF9sZXR0ZXIpCmBgYAojIyBJZi10aGVuLWVsc2UgYW5kIGZhY2V0X2dyaWQgZXhhbXBsZQoKKiBUYWtlIHRoZSBjb25zb25hbnRzLCBjYXRlZ29yaXNlIHRoZW0gYnkgKnBsYWNlKiBhbmQgKm1hbm5lciogb2YgYXJ0aWN1bGF0aW9uLCB1c2UgKipmYWNldF9ncmlkKioKKiBtYXBfaW50LCBtYXBfZGJsLCAqKm1hcF9jaHIqKiAoZXF1aXZhbGVudHMgdG8gc2FwcGx5LCBwYXJ0IG9mIHB1cnJyKQoqIGxlZ2VuZHMgaHR0cDovL3d3dy5jb29rYm9vay1yLmNvbS9HcmFwaHMvTGVnZW5kc18oZ2dwbG90MikvCgpgYGB7cn0KZ2V0X21hbm5lciA8LSBmdW5jdGlvbihDKSB7CiAgaWYgKEMgJWluJSBjKCJCIiwiRCIsIkciKSl7CiAgICByZXR1cm4oInN0b3AsIHZvaWNlZCIpCiAgfSBlbHNlIGlmIChDICVpbiUgYygiUCIsICJUIiwgIksiKSl7CiAgICByZXR1cm4oInN0b3AsIHZvaWNlbGVzcyIpCiAgfSBlbHNl
IGlmIChDICVpbiUgYygiTSIsICJOIikpewogICAgcmV0dXJuKCJuYXNhbCIpCiAgfSBlbHNlIGlmIChDICVpbiUgYygiRiIsICJTIiwgIkgiKSl7CiAgICByZXR1cm4oImZyaWNhdGl2ZSwgdm9pY2VsZXNzIikKICB9IGVsc2UgaWYgKEMgJWluJSBjKCJWIiwgIloiKSl7CiAgICByZXR1cm4oImZyaWNhdGl2ZSwgdm9pY2VkIikKICB9IGVsc2UgcmV0dXJuKE5BKQp9CgpnZXRfcGxhY2UgPC0gZnVuY3Rpb24oQykgewogIGlmIChDICVpbiUgYygiQiIsICJQIiwgIkYiLCAiViIsICJNIikpewogICAgcmV0dXJuKCJiaWxhYmlhbCIpCiAgfSBlbHNlIGlmIChDICVpbiUgYygiRCIsICJUIiwgIlMiLCAiWiIsICJOIikpewogICAgcmV0dXJuKCJhbHZlb2xhciIpCiAgfSBlbHNlIGlmIChDICVpbiUgYygiRyIsICJLIiwgIkgiKSl7CiAgICByZXR1cm4oInZlbGFyIGV0Yy4iKQogIH0gZWxzZSB7cmV0dXJuKE5BKX0KfQpiYWJ5bmFtZXMgJT4lIG11dGF0ZShpbml0aWFsX0M9ZmFjdG9yKHN0cl9leHRyYWN0KG5hbWUsICJeW0JER1BUS01OVkZTVlpIXSIpKSkgJT4lCiAgZmlsdGVyKCFpcy5uYShpbml0aWFsX0MpKSAlPiUKICBncm91cF9ieSh5ZWFyLCBzZXgsIGluaXRpYWxfQykgJT4lCiAgc3VtbWFyaXNlKHRvdGFsPXN1bShuKSkgJT4lCiAgbXV0YXRlKAogICAgbWFubmVyPWZhY3RvcihtYXBfY2hyKGluaXRpYWxfQywgZ2V0X21hbm5lciksCiAgICAgICAgICAgICAgICAgIGxldmVscz1jKCJzdG9wLCB2b2ljZWQiLCAic3RvcCwgdm9pY2VsZXNzIiwgImZyaWNhdGl2ZSwgdm9pY2VkIiwgImZyaWNhdGl2ZSwgdm9pY2VsZXNzIiwgIm5hc2FsIikpLCAKICAgIHBsYWNlPWZhY3RvcihtYXBfY2hyKGluaXRpYWxfQywgZ2V0X3BsYWNlKSwgCiAgICAgICAgICAgICAgICAgbGV2ZWxzPWMoImJpbGFiaWFsIiwgImFsdmVvbGFyIiwgInZlbGFyIGV0Yy4iKSkpICU+JQogIGdncGxvdChhZXMoeD15ZWFyLCB5PXRvdGFsLCBjb2xvdXI9c2V4KSkgKyBnZW9tX2xpbmUoKSArIGZhY2V0X2dyaWQobWFubmVyIH4gcGxhY2UpICsgZ2VvbV90ZXh0KHg9MTkwMCwgeT0yMDAwMDAsIGNvbG91cj0iYmxhY2siLCBhZXMobGFiZWw9aW5pdGlhbF9DKSkgKyBsYWJzKHRpdGxlPSJQb3B1bGFyaXR5IG9mIHNlbGVjdGVkIGluaXRpYWwgY29uc29uYW50cyBpbiBnaXZlbiBuYW1lcyIpCmBgYAoKIyMgTWFwcGluZyBmdW5jdGlvbnMKClByb2R1Y2UgYSB2ZWN0b3IgZnJvbSBhIGZ1bmN0aW9uCgotIG1hcF9pbnQKLSBtYXBfZGJsCi0gKiptYXBfY2hyKioKClRha2UgbmFtZXMgYW5kIGNhcGl0YWxpc2UgYWxsIHRoZSB2b3dlbC1maW5hbAoKYGBge3J9CnZmaW5hbDJjYXAgPC0gZnVuY3Rpb24od29yZCl7CiAgaWYgKHN0cl9kZXRlY3Qod29yZCwgIlthZWlvdXldJCIpKXsKICAgIHJldHVybihzdHJfdG9fdXBwZXIod29yZCkpCiAgfSBlbHNlIHtyZXR1cm4oc3RyX3RvX2xvd2VyKHdvcmQpKX0KfQp2ZmluYWwyY2FwKCJFbW1hIikKdmZpbmFsMmNhcCgiQm9iIikKYmFieW5h
bWVzICU+JSBwdWxsKG5hbWUpICU+JSB1bmlxdWUoKSAtPiBhbGxfbmFtZXMKbWFwX2NocihhbGxfbmFtZXMsIHZmaW5hbDJjYXApICU+JSBoZWFkKDI0KQpgYGAKCgoKCgoK