README

fuzzystring provides fast, flexible fuzzy string joins for data frames using approximate string matching. Built on top of data.table and stringdist, it’s designed for efficiently merging datasets where exact matches aren’t possible due to misspellings, inconsistent formatting, or slight variations in text.

Why fuzzystring?

Installation

# Development version from GitHub
# Using pak (recommended)
pak::pak("PaulESantos/fuzzystring")

# Or using remotes
remotes::install_github("PaulESantos/fuzzystring")

Quick start

library(fuzzystring)

# Your messy data
x <- data.frame(
  name = c("Idea", "Premiom", "Very Good"), 
  id = 1:3
)

# Reference data
y <- data.frame(
  approx_name = c("Ideal", "Premium", "VeryGood"), 
  grp = c("A", "B", "C")
)

# Fuzzy join with max distance of 2 edits
fuzzystring_inner_join(
  x, y,
  by = c(name = "approx_name"),
  max_dist = 2,
  distance_col = "distance"
)
#>      name id approx_name grp distance
#> 1:   Idea  1       Ideal   A        1
#> 2: Premiom  2     Premium   B        2

Key features

All join types supported

# Inner join - only matching rows
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), max_dist = 2)

# Left join - all rows from x, matching rows from y
fuzzystring_left_join(x, y, by = c(name = "approx_name"), max_dist = 2)

# Right join - all rows from y, matching rows from x
fuzzystring_right_join(x, y, by = c(name = "approx_name"), max_dist = 2)

# Full join - all rows from both tables
fuzzystring_full_join(x, y, by = c(name = "approx_name"), max_dist = 2)

# Semi join - rows from x that have a match in y
fuzzystring_semi_join(x, y, by = c(name = "approx_name"), max_dist = 2)

# Anti join - rows from x that don't have a match in y
fuzzystring_anti_join(x, y, by = c(name = "approx_name"), max_dist = 2)

Multiple distance methods

# Optimal String Alignment (default)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "osa")

# Damerau-Levenshtein
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "dl")

# Jaro-Winkler (good for names)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "jw")

# Soundex (phonetic matching)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "soundex")

Case-insensitive matching

fuzzystring_inner_join(
  x, y, 
  by = c(name = "approx_name"),
  ignore_case = TRUE,
  max_dist = 1
)

Extended example: Correcting misspellings

A common use case is matching misspelled words against a dictionary. The package includes a dataset of 4,500+ common misspellings from Wikipedia.

library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.6
#> ✔ forcats   1.0.1     ✔ stringr   1.6.0
#> ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
#> ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
#> ✔ purrr     1.2.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fuzzystring)

# Load misspellings dataset
data(misspellings)
misspellings
#> # A tibble: 4,505 × 2
#>    misspelling correct   
#>    <chr>       <chr>     
#>  1 abandonned  abandoned 
#>  2 aberation   aberration
#>  3 abilties    abilities 
#>  4 abilty      ability   
#>  5 abondon     abandon   
#>  6 abbout      about     
#>  7 abotu       about     
#>  8 abouta      about a   
#>  9 aboutit     about it  
#> 10 aboutthe    about the 
#> # ℹ 4,495 more rows

# Dictionary from qdapDictionaries package
library(qdapDictionaries)
words <- dplyr::as_tibble(DICTIONARY)
words
#> # A tibble: 20,137 × 2
#>    word  syllables
#>    <chr>     <dbl>
#>  1 hm            1
#>  2 hmm           1
#>  3 hmmm          1
#>  4 hmph          1
#>  5 mmhmm         2
#>  6 mmhm          2
#>  7 mm            1
#>  8 mmm           1
#>  9 mmmm          1
#> 10 pff           1
#> # ℹ 20,127 more rows

set.seed(2016)
sub_misspellings <- misspellings %>%
  sample_n(1000)

# Fuzzy join with max distance of 1
joined <- sub_misspellings %>%
  fuzzystring_inner_join(
    words, 
    by = c(misspelling = "word"), 
    max_dist = 1
  )

joined
#>      misspelling    correct       word syllables
#>           <char>     <char>     <char>     <num>
#>   1:   cyclinder   cylinder   cylinder         3
#>   2: beastiality bestiality bestiality         5
#>   3:    affilate  affiliate  affiliate         4
#>   4:     supress   suppress   suppress         2
#>   5:    intevene  intervene  intervene         3
#>  ---                                            
#> 756:        fiel      field       file         1
#> 757:        fiel      field       fill         1
#> 758:        fiel      field       fuel         2
#> 759:        fiel      field       riel         2
#> 760:    aparment  apartment  apartment         3

Analyzing match quality

# Count unique misspellings matched
joined %>%
  count(misspelling, correct)
#>      misspelling    correct     n
#>           <char>     <char> <int>
#>   1:      abilty    ability     1
#>   2:   accademic   academic     1
#>   3:    accademy    academy     1
#>   4:   accension  accession     2
#>   5:  acceptence acceptance     1
#>  ---                             
#> 458:        wnat       want     3
#> 459:  wonderfull  wonderful     1
#> 460:       wroet      wrote     1
#> 461:     wupport    support     1
#> 462:      zeebra      zebra     1

# Calculate accuracy
which_correct <- joined %>%
  group_by(misspelling, correct) %>%
  summarize(
    guesses = n(), 
    one_correct = any(correct == word),
    .groups = "drop"
  )

# Percentage getting at least one correct guess
mean(which_correct$one_correct)
#> [1] 0.8246753

# Number with exactly one correct match
sum(which_correct$guesses == 1 & which_correct$one_correct)
#> [1] 290

Including distance metrics

joined_dists <- sub_misspellings %>%
  fuzzystring_inner_join(
    words, 
    by = c(misspelling = "word"), 
    max_dist = 2,
    distance_col = "distance"
  )

# Find the closest match for each misspelling
closest <- joined_dists %>%
  group_by(misspelling) %>%
  top_n(1, desc(distance)) %>%
  ungroup()

# Distribution of distances
closest %>%
  count(distance)
#> # A tibble: 3 × 2
#>   distance     n
#>      <dbl> <int>
#> 1        0     3
#> 2        1   739
#> 3        2   594

Using different join types

left_joined <- sub_misspellings %>%
  fuzzystring_left_join(
    words, 
    by = c(misspelling = "word"), 
    max_dist = 1
  )

# Show words we couldn't match
left_joined %>%
  filter(is.na(word))
#>        misspelling       correct   word syllables
#>             <char>        <char> <char>     <num>
#>   1:     Sanhedrim     Sanhedrin   <NA>        NA
#>   2: consicousness consciousness   <NA>        NA
#>   3:    repubicans   republicans   <NA>        NA
#>   4:      comitted     committed   <NA>        NA
#>   5:     emmisions     emissions   <NA>        NA
#>  ---                                             
#> 534:   accquainted    acquainted   <NA>        NA
#> 535:   ubiquitious    ubiquitous   <NA>        NA
#> 536:       Januray       January   <NA>        NA
#> 537:   aggaravates    aggravates   <NA>        NA
#> 538:       wayword       wayward   <NA>        NA

unmatched <- sub_misspellings %>%
  fuzzystring_anti_join(
    words, 
    by = c(misspelling = "word"), 
    max_dist = 1
  )

unmatched
#>        misspelling       correct
#>             <char>        <char>
#>   1:     Sanhedrim     Sanhedrin
#>   2: consicousness consciousness
#>   3:    repubicans   republicans
#>   4:      comitted     committed
#>   5:     emmisions     emissions
#>  ---                            
#> 534:   accquainted    acquainted
#> 535:   ubiquitious    ubiquitous
#> 536:       Januray       January
#> 537:   aggaravates    aggravates
#> 538:       wayword       wayward

Performance

fuzzystring uses a C++ implementation for row binding combined with a data.table backend for fast performance on large datasets. The package is optimized for:

Advanced usage

Multiple column joins

# Match on both string similarity and numeric proximity
fuzzystring_inner_join(
  x, y,
  by = c(name = "approx_name", value = "approx_value"),
  match_fun = list(
    name = function(x, y) stringdist::stringdist(x, y) <= 1,
    value = function(x, y) abs(x - y) < 0.5
  )
)

Custom distance functions

# Use custom matching logic
fstring_inner_join(
  x, y,
  by = c(name = "approx_name"),
  match_fun = function(x, y) {
    # Your custom matching logic
    result <- your_distance_function(x, y)
    result <= threshold
  }
)

Related packages

Credits

fuzzystring builds on these great packages to provide a fast, data.table-native implementation with C++ optimization.