
fuzzystring provides fast, flexible fuzzy string
joins for data frames using approximate string matching. Built on top of
data.table and stringdist, it’s designed for
efficiently merging datasets where exact matches aren’t possible due to
misspellings, inconsistent formatting, or slight variations in text.
Real-world data is messy. You might need to:
fuzzystring makes these tasks straightforward with:
data.table backend for speed# Development version from GitHub
# Using pak (recommended)
pak::pak("PaulESantos/fuzzystring")
# Or using remotes
remotes::install_github("PaulESantos/fuzzystring")Here’s a simple example matching diamond cuts with slight misspellings:
library(fuzzystring)
# Your messy data
x <- data.frame(
name = c("Idea", "Premiom", "Very Good"),
id = 1:3
)
# Reference data
y <- data.frame(
approx_name = c("Ideal", "Premium", "VeryGood"),
grp = c("A", "B", "C")
)
# Fuzzy join with max distance of 2 edits
fuzzystring_inner_join(
x, y,
by = c(name = "approx_name"),
max_dist = 2,
distance_col = "distance"
)
#> name id approx_name grp distance
#> 1: Idea 1 Ideal A 1
#> 2: Premiom 2 Premium B 2# Inner join - only matching rows
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), max_dist = 2)
# Left join - all rows from x, matching rows from y
fuzzystring_left_join(x, y, by = c(name = "approx_name"), max_dist = 2)
# Right join - all rows from y, matching rows from x
fuzzystring_right_join(x, y, by = c(name = "approx_name"), max_dist = 2)
# Full join - all rows from both tables
fuzzystring_full_join(x, y, by = c(name = "approx_name"), max_dist = 2)
# Semi join - rows from x that have a match in y
fuzzystring_semi_join(x, y, by = c(name = "approx_name"), max_dist = 2)
# Anti join - rows from x that don't have a match in y
fuzzystring_anti_join(x, y, by = c(name = "approx_name"), max_dist = 2)# Optimal String Alignment (default)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "osa")
# Damerau-Levenshtein
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "dl")
# Jaro-Winkler (good for names)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "jw")
# Soundex (phonetic matching)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "soundex")fuzzystring_inner_join(
x, y,
by = c(name = "approx_name"),
ignore_case = TRUE,
max_dist = 1
)A common use case is matching misspelled words against a dictionary. The package includes a dataset of 4,500+ common misspellings from Wikipedia.
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.6
#> ✔ forcats 1.0.1 ✔ stringr 1.6.0
#> ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
#> ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
#> ✔ purrr 1.2.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fuzzystring)
# Load misspellings dataset
data(misspellings)
misspellings
#> # A tibble: 4,505 × 2
#> misspelling correct
#> <chr> <chr>
#> 1 abandonned abandoned
#> 2 aberation aberration
#> 3 abilties abilities
#> 4 abilty ability
#> 5 abondon abandon
#> 6 abbout about
#> 7 abotu about
#> 8 abouta about a
#> 9 aboutit about it
#> 10 aboutthe about the
#> # ℹ 4,495 more rowsLet’s match these against a real dictionary:
# Dictionary from qdapDictionaries package
library(qdapDictionaries)
words <- dplyr::as_tibble(DICTIONARY)
words
#> # A tibble: 20,137 × 2
#> word syllables
#> <chr> <dbl>
#> 1 hm 1
#> 2 hmm 1
#> 3 hmmm 1
#> 4 hmph 1
#> 5 mmhmm 2
#> 6 mmhm 2
#> 7 mm 1
#> 8 mmm 1
#> 9 mmmm 1
#> 10 pff 1
#> # ℹ 20,127 more rowsSample 1,000 misspellings and find their matches:
set.seed(2016)
sub_misspellings <- misspellings %>%
sample_n(1000)
# Fuzzy join with max distance of 1
joined <- sub_misspellings %>%
fuzzystring_inner_join(
words,
by = c(misspelling = "word"),
max_dist = 1
)
joined
#> misspelling correct word syllables
#> <char> <char> <char> <num>
#> 1: cyclinder cylinder cylinder 3
#> 2: beastiality bestiality bestiality 5
#> 3: affilate affiliate affiliate 4
#> 4: supress suppress suppress 2
#> 5: intevene intervene intervene 3
#> ---
#> 756: fiel field file 1
#> 757: fiel field fill 1
#> 758: fiel field fuel 2
#> 759: fiel field riel 2
#> 760: aparment apartment apartment 3Check how many misspellings we successfully matched:
# Count unique misspellings matched
joined %>%
count(misspelling, correct)
#> misspelling correct n
#> <char> <char> <int>
#> 1: abilty ability 1
#> 2: accademic academic 1
#> 3: accademy academy 1
#> 4: accension accession 2
#> 5: acceptence acceptance 1
#> ---
#> 458: wnat want 3
#> 459: wonderfull wonderful 1
#> 460: wroet wrote 1
#> 461: wupport support 1
#> 462: zeebra zebra 1
# Calculate accuracy
which_correct <- joined %>%
group_by(misspelling, correct) %>%
summarize(
guesses = n(),
one_correct = any(correct == word),
.groups = "drop"
)
# Percentage getting at least one correct guess
mean(which_correct$one_correct)
#> [1] 0.8246753
# Number with exactly one correct match
sum(which_correct$guesses == 1 & which_correct$one_correct)
#> [1] 290Add a distance column to see how far apart the matches are:
joined_dists <- sub_misspellings %>%
fuzzystring_inner_join(
words,
by = c(misspelling = "word"),
max_dist = 2,
distance_col = "distance"
)
# Find the closest match for each misspelling
closest <- joined_dists %>%
group_by(misspelling) %>%
top_n(1, desc(distance)) %>%
ungroup()
# Distribution of distances
closest %>%
count(distance)
#> # A tibble: 3 × 2
#> distance n
#> <dbl> <int>
#> 1 0 3
#> 2 1 739
#> 3 2 594Left join to keep unmatched words:
left_joined <- sub_misspellings %>%
fuzzystring_left_join(
words,
by = c(misspelling = "word"),
max_dist = 1
)
# Show words we couldn't match
left_joined %>%
filter(is.na(word))
#> misspelling correct word syllables
#> <char> <char> <char> <num>
#> 1: Sanhedrim Sanhedrin <NA> NA
#> 2: consicousness consciousness <NA> NA
#> 3: repubicans republicans <NA> NA
#> 4: comitted committed <NA> NA
#> 5: emmisions emissions <NA> NA
#> ---
#> 534: accquainted acquainted <NA> NA
#> 535: ubiquitious ubiquitous <NA> NA
#> 536: Januray January <NA> NA
#> 537: aggaravates aggravates <NA> NA
#> 538: wayword wayward <NA> NAAnti join to find only unmatched words:
unmatched <- sub_misspellings %>%
fuzzystring_anti_join(
words,
by = c(misspelling = "word"),
max_dist = 1
)
unmatched
#> misspelling correct
#> <char> <char>
#> 1: Sanhedrim Sanhedrin
#> 2: consicousness consciousness
#> 3: repubicans republicans
#> 4: comitted committed
#> 5: emmisions emissions
#> ---
#> 534: accquainted acquainted
#> 535: ubiquitious ubiquitous
#> 536: Januray January
#> 537: aggaravates aggravates
#> 538: wayword waywardfuzzystring uses a C++ implementation for row
binding combined with a data.table backend for fast
performance on large datasets. The package is optimized for:
# Match on both string similarity and numeric proximity
fuzzystring_inner_join(
x, y,
by = c(name = "approx_name", value = "approx_value"),
match_fun = list(
name = function(x, y) stringdist::stringdist(x, y) <= 1,
value = function(x, y) abs(x - y) < 0.5
)
)# Use custom matching logic
fstring_inner_join(
x, y,
by = c(name = "approx_name"),
match_fun = function(x, y) {
# Your custom matching logic
result <- your_distance_function(x, y)
result <= threshold
}
)fuzzyjoin package, which served as the base for
fuzzystring; some core fuzzyjoin functions
were adapted and reinterpreted using data.table.fuzzystring builds on these great packages to provide a fast, data.table-native implementation with C++ optimization.