The word list comes from
this JS
file. Look for var Ma=
to see the list. You can see the full code
here
library(tidyverse)
library(glue)
library(gt)
library(scales)
library(simplecolors)
# read in NYT data
raw_words <-
read_csv("input/nyt_word_list.csv") |>
as_tibble() |>
print()
# A tibble: 2,309 x 1
word
<chr>
1 cigar
2 rebut
3 sissy
4 humph
5 awake
# ... with 2,304 more rows
# Break out each letter into its own column
word_letters <-
raw_words |>
separate(
col = word,
sep = "",
into = paste0("x", 0:5),
remove = FALSE
) |>
select(-x0) |>
print()
# A tibble: 2,309 x 6
word x1 x2 x3 x4 x5
<chr> <chr> <chr> <chr> <chr> <chr>
1 cigar c i g a r
2 rebut r e b u t
3 sissy s i s s y
4 humph h u m p h
5 awake a w a k e
# ... with 2,304 more rows
# Pivot the data to long-form and tally up frequencies
word_letters_long <-
word_letters |>
pivot_longer(
cols = starts_with("x"),
names_to = "position",
values_to = "letter"
) |>
# times letter was used at all
add_count(letter, name = "letter_freq") |>
# times letter was used in position
add_count(letter, position, name = "position_freq") |>
print()
# A tibble: 11,545 x 5
word position letter letter_freq position_freq
<chr> <chr> <chr> <int> <int>
1 cigar x1 c 475 198
2 cigar x2 i 670 201
3 cigar x3 g 310 67
4 cigar x4 a 975 162
5 cigar x5 r 897 212
# ... with 11,540 more rows
# Pivot wider
word_letter_freq <-
word_letters_long |>
select(word, pos = letter, position, freq = position_freq) |>
pivot_wider(
names_from = position,
values_from = c(pos, freq)
) |>
rename_all(str_remove_all, "pos_") |>
print()
# A tibble: 2,309 x 11
word x1 x2 x3 x4 x5 freq_x1 freq_x2 freq_x3 freq_x4 freq_x5
<chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int>
1 cigar c i g a r 198 201 67 162 212
2 rebut r e b u t 105 241 56 82 253
3 sissy s i s s y 365 201 80 171 364
4 humph h u m p h 69 185 61 50 137
5 awake a w a k e 140 44 306 55 422
# ... with 2,304 more rows
# Letters by total count and frequency in positions 1:5
letter_stats <-
word_letters_long |>
distinct(letter, letter_freq, position, position_freq) |>
pivot_wider(
names_from = position,
values_from = position_freq,
values_fill = 0
) |>
arrange(desc(letter_freq))
letter | # words | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|---|
e | 1230 | 72 | 241 | 177 | 318 | 422 |
a | 975 | 140 | 304 | 306 | 162 | 63 |
r | 897 | 105 | 267 | 163 | 150 | 212 |
o | 753 | 41 | 279 | 243 | 132 | 58 |
t | 729 | 149 | 77 | 111 | 139 | 253 |
l | 716 | 87 | 200 | 112 | 162 | 155 |
i | 670 | 34 | 201 | 266 | 158 | 11 |
s | 668 | 365 | 16 | 80 | 171 | 36 |
n | 573 | 37 | 87 | 137 | 182 | 130 |
c | 475 | 198 | 40 | 56 | 150 | 31 |
u | 466 | 33 | 185 | 165 | 82 | 1 |
y | 424 | 6 | 22 | 29 | 3 | 364 |
d | 393 | 111 | 20 | 75 | 69 | 118 |
h | 387 | 69 | 144 | 9 | 28 | 137 |
p | 365 | 141 | 61 | 57 | 50 | 56 |
m | 316 | 107 | 38 | 61 | 68 | 42 |
g | 310 | 115 | 11 | 67 | 76 | 41 |
b | 280 | 173 | 16 | 56 | 24 | 11 |
f | 229 | 135 | 8 | 25 | 35 | 26 |
k | 210 | 20 | 10 | 12 | 55 | 113 |
w | 194 | 82 | 44 | 26 | 25 | 17 |
v | 152 | 43 | 15 | 49 | 45 | 0 |
z | 40 | 3 | 2 | 11 | 20 | 4 |
x | 37 | 0 | 14 | 12 | 3 | 8 |
q | 29 | 23 | 5 | 1 | 0 | 0 |
j | 27 | 20 | 2 | 3 | 2 | 0 |
# Create metrics for each word. letter_stats$letter is a factor so `letters
word_stats <-
word_letters_long |>
mutate(
letter = factor(letter, levels = letter_stats$letter, ordered = TRUE),
letter_ord = as.integer(letter)
) |>
group_by(word) |>
arrange(letter) |>
summarise(
alpha = paste(sort(as.character(letter)), collapse = ""),
n_letter = n_distinct(letter),
n_1_5 = sum(letter %in% letter_stats$letter[1:5]),
n_6_10 = sum(letter %in% letter_stats$letter[6:10]),
n_10_15 = sum(letter %in% letter_stats$letter[10:15]),
n_12_26 = sum(letter %in% letter_stats$letter[12:26]),
# are the letters closest to the top (1:26, lower is better)
mean_letter_rank = mean(letter_ord),
# are the letter closer to their most common positions (higher is better)
mean_pos_freq = mean(position_freq)
#,mean_letter_freq = mean(letter_freq),sum_letter_freq = sum(letter_freq)
) |>
ungroup() |>
arrange(mean_letter_rank) |>
print()
# A tibble: 2,309 x 9
word alpha n_letter n_1_5 n_6_10 n_10_15 n_12_26 mean_letter_rank
<chr> <chr> <int> <int> <int> <int> <int> <dbl>
1 eater aeert 4 5 0 0 0 2.4
2 rarer aerrr 3 5 0 0 0 2.4
3 eerie eeeir 3 4 1 0 0 2.6
4 error eorrr 3 5 0 0 0 2.8
5 terra aerrt 4 5 0 0 0 2.8
# ... with 2,304 more rows, and 1 more variable: mean_pos_freq <dbl>
# find words that have 5 distinct values
distinct_letters <-
word_stats |>
filter(n_letter == 5) |>
print()
# A tibble: 1,562 x 9
word alpha n_letter n_1_5 n_6_10 n_10_15 n_12_26 mean_letter_rank
<chr> <chr> <int> <int> <int> <int> <int> <dbl>
1 alert aelrt 5 4 1 0 0 3.4
2 alter aelrt 5 4 1 0 0 3.4
3 later aelrt 5 4 1 0 0 3.4
4 arose aeors 5 4 1 0 0 3.6
5 irate aeirt 5 4 1 0 0 3.6
# ... with 1,557 more rows, and 1 more variable: mean_pos_freq <dbl>
# roll-up words by patterns (for regex)
as_patterns <-
distinct_letters |>
# roll up words by best placement
group_by(alpha) |>
arrange(desc(mean_pos_freq)) |>
summarise(
words = paste(word, collapse = ","),
mean_letter_rank = mean(mean_letter_rank), # same across all values
across(matches("n_\\d"), mean),
) |>
ungroup() |>
# create pattern column, move to front
# mutate(
# pattern = str_replace_all(alpha, "(.)(?=.)", "\\1|"),
# .before = everything()
# ) |>
arrange(mean_letter_rank) |>
print()
# A tibble: 1,391 x 7
alpha words mean_letter_rank n_1_5 n_6_10 n_10_15 n_12_26
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 aelrt later,alter,alert 3.4 4 1 0 0
2 aeirt irate 3.6 4 1 0 0
3 aeors arose 3.6 4 1 0 0
4 aerst stare 3.8 4 1 0 0
5 acert crate,trace,cater,react 4.2 4 1 1 0
# ... with 1,386 more rows
# best starting words primarily from letters 1-5: later, irate, stare, arose, stare, crate
as_patterns |>
filter(n_1_5 >= 4) |>
head(3)
# A tibble: 3 x 7
alpha words mean_letter_rank n_1_5 n_6_10 n_10_15 n_12_26
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 aelrt later,alter,alert 3.4 4 1 0 0
2 aeirt irate 3.6 4 1 0 0
3 aeors arose 3.6 4 1 0 0
# best 2nd word from 6-10: slain, slice, since, sonic, sling, cling, slink, slick
as_patterns |>
filter(n_6_10 >= 4) |>
head(3)
# A tibble: 3 x 7
alpha words mean_letter_rank n_1_5 n_6_10 n_10_15 n_12_26
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ailns slain,snail 6.4 1 4 0 0
2 ceils slice 6.4 1 4 1 0
3 ceins since 7 1 4 1 0
# best 3rd words from letters 10-15: duchy, dutch, pouch, punch, pushy, chump, dumpy, pudgy
as_patterns |>
filter(n_10_15 >= 4) |>
arrange(desc(n_10_15), mean_letter_rank) |>
head(3)
# A tibble: 3 x 7
alpha words mean_letter_rank n_1_5 n_6_10 n_10_15 n_12_26
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 cdhuy duchy 12 0 1 5 3
2 cdhtu dutch 10.6 1 1 4 2
3 chopu pouch 10.8 1 1 4 2
# best remaining words from rest of alphabet: gawky, maybe, foamy, gumbo, buxom, jumbo
as_patterns |>
filter(n_12_26 >= 3) |>
filter(!str_detect(alpha, "[rtlsncdhp]")) |>
arrange(desc(n_12_26)) |>
head(3)
# A tibble: 3 x 7
alpha words mean_letter_rank n_1_5 n_6_10 n_10_15 n_12_26
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 agkwy gawky 14.4 1 0 1 4
2 abemy maybe 9.8 2 0 1 3
3 afmoy foamy 10.6 2 0 1 3
# Look for next best word once existing letters have been used
find_next <- function(use_pattern, suffix = "") {
as_patterns |>
# see which are used the fewest
mutate(found = str_count(alpha, glue("[{use_pattern}]"))) |>
arrange(found, mean_letter_rank) |>
slice(1) |>
distinct(alpha, words, rank = mean_letter_rank) |>
rename_all(paste0, suffix)
}
# best starting word
find_next(use_pattern = ".")
# A tibble: 1 x 3
alpha words rank
<chr> <chr> <dbl>
1 aelrt later,alter,alert 3.4
# best word after [aelrt] (later, alter, alert)
find_next(use_pattern = "aelrt")
# A tibble: 1 x 3
alpha words rank
<chr> <chr> <dbl>
1 cinos sonic,scion 7.6
# best word after [aelrt] + [cinos] (sonic, scion)
find_next(use_pattern = "aelrtcinos")
# A tibble: 1 x 3
alpha words rank
<chr> <chr> <dbl>
1 dmpuy dumpy 13.4
# Find best sequences
best_sequences <-
as_patterns |>
filter(n_1_5 >= 4) |>
distinct(
alpha_1 = alpha,
words_1 = words,
rank_1 = mean_letter_rank
) |>
mutate(
second = map(
.x = alpha_1,
.f = find_next,
suffix = "_2"
)
) |>
unnest_wider(second) |>
mutate(
third = map(
.x = paste0(alpha_1, alpha_2),
.f = find_next,
suffix = "_3"
)
) |>
unnest_wider(third) |>
mutate(
forth = map(
.x = paste0(alpha_1, alpha_2, alpha_3),
.f = find_next,
suffix = "_4"
)
) |>
unnest_wider(forth) |>
mutate(
n_distinct = map_chr(
.x = paste(alpha_1, alpha_2, alpha_3, alpha_4),
.f =
~ str_split(.x, "") |>
unlist() |>
unique() |>
length()
),
sum_rank = (rank_1 + rank_2 + rank_3)
) |>
arrange(desc(n_distinct), sum_rank)
head(best_sequences, 5)
# A tibble: 5 x 14
alpha_1 words_1 rank_1 alpha_2 words_2 rank_2 alpha_3 words_3 rank_3 alpha_4
<chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 adeor adore 4.6 ilntu unlit,~ 7.6 achms chasm 10 agkwy
2 aeors arose 3.6 ilntu unlit,~ 7.6 achmp champ 11.4 abdwy
3 eortu route,ou~ 4.8 ailns slain,~ 6.4 achmp champ 11.4 abdwy
4 eoprt trope 5.6 ailns slain,~ 6.4 cdhuy duchy 12 bgmou
5 enort tenor 4.4 aclsy scaly 7.6 dhimu humid 12.2 agkwy
# ... with 4 more variables: words_4 <chr>, rank_4 <dbl>, n_distinct <chr>,
# sum_rank <dbl>
# Create plot
best_sequences |>
head(10) |>
select(starts_with(words), starts_with("rank")) |>
mutate(order = row_number()) |>
mutate_all(as.character) |>
pivot_longer(
-order,
names_to = c("type", "id"),
names_sep = "_"
) |>
pivot_wider(
names_from = type,
values_from = value
) |>
mutate(across(c(order, id, rank), as.numeric)) |> #print()
ggplot(aes(order, id)) +
geom_tile(aes(fill = rank), alpha = 0.8) +
geom_text(
aes(label = str_replace_all(words, ",", "\n")),
size = 3
) +
scale_fill_viridis_c(
option = "D",
direction = -1
) +
scale_x_continuous(expand = expansion()) +
scale_y_reverse() +
coord_fixed() +
theme(
panel.background = element_rect(fill = "white", color = "white"),
axis.title = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) +
labs(
title = "Best Plays",
fill = "Letter Rank"
)
# best series: from nytimes js
# adore # arose # route # trope
# until # unlit # slain # slain
# chasm # champ # champ # duchy
# gawky # bawdy # bawdy # gumbo
# best series: from dictionary
# arose # arose
# unlit # clint
# champ # dumpy
# gawky # hawks
# solving wordle
# example: floor - has [lor] but not [ais...], [l] is the 2nd letter
str_detect_each <- function(x, letters) {
pattern <-
str_split(letters, "")[[1]] |>
sort() |>
paste(collapse = "|")
matches <-
str_extract_all(x, pattern) |>
map(pluck) |>
map(sort) |>
map_chr(paste, collapse = "|")
matches == pattern
}
word_stats |>
# green
filter(str_detect(word, "s.a..")) |> # these are filled in
# yellow
filter(str_detect_each(word, "te")) |> # has these other letters
filter(!str_detect(word, "...[t].")) |> # t is not in the 3rd position
# grey
filter(!str_detect(word, "[liopr]")) # none of these letters
# A tibble: 3 x 9
word alpha n_letter n_1_5 n_6_10 n_10_15 n_12_26 mean_letter_rank
<chr> <chr> <int> <int> <int> <int> <int> <dbl>
1 stage aegst 5 3 1 0 1 6.6
2 stake aekst 5 3 1 0 1 7.2
3 stave aestv 5 3 1 0 1 7.6
# ... with 1 more variable: mean_pos_freq <dbl>