Getting Started with fuzzystring
Paul Efren Santos Andrade
2026-02-05
Source: vignettes/getting_started.Rmd
getting_started.Rmd
Introduction
fuzzystring provides fast, flexible fuzzy string
joins for data frames using approximate string matching. Built on top of
data.table and stringdist, it’s designed for
efficiently merging datasets where exact matches aren’t possible due to
misspellings, inconsistent formatting, or slight variations in text.
Installation
You can install the development version of fuzzystring from GitHub:
# Using pak (recommended)
# pak::pak("PaulESantos/fuzzystring")
# Or using remotes
# remotes::install_github("PaulESantos/fuzzystring")
Quick Start
Here’s a simple example matching diamond cuts with slight misspellings:
# Your messy data
x <- data.frame(
name = c("Idea", "Premiom", "Very Good"),
id = 1:3
)
# Reference data
y <- data.frame(
approx_name = c("Ideal", "Premium", "VeryGood"),
grp = c("A", "B", "C")
)
# Fuzzy join with max distance of 2 edits
fuzzystring_inner_join(
x, y,
by = c(name = "approx_name"),
max_dist = 2,
distance_col = "distance"
)
#> name id approx_name grp distance
#> <char> <int> <char> <char> <num>
#> 1: Idea 1 Ideal A 1
#> 2: Premiom 2 Premium B 1
#> 3: Very Good 3 VeryGood C 1
Key Features
All Join Types Supported
fuzzystring supports all standard join types. Below is a small, reusable example dataset so you can compare the behavior of each join family.
x_join <- data.frame(
name = c("Idea", "Premiom", "Very Good", "Gooood"),
id = 1:4
)
y_join <- data.frame(
approx_name = c("Ideal", "Premium", "VeryGood", "Good"),
grp = c("A", "B", "C", "D")
)
- fuzzystring_inner_join(): Only matching rows.
- fuzzystring_left_join(): All rows from x, matching rows from y.
- fuzzystring_right_join(): All rows from y, matching rows from x.
- fuzzystring_full_join(): All rows from both tables.
- fuzzystring_semi_join(): Rows from x that have a match in y.
- fuzzystring_anti_join(): Rows from x that don’t have a match in y.
Inner join
fuzzystring_inner_join(
x_join, y_join,
by = c(name = "approx_name"),
max_dist = 2,
distance_col = "distance"
)
#> name id approx_name grp distance
#> <char> <int> <char> <char> <num>
#> 1: Idea 1 Ideal A 1
#> 2: Premiom 2 Premium B 1
#> 3: Very Good 3 VeryGood C 1
#> 4: Gooood 4 Good D 2
Left join
fuzzystring_left_join(
x_join, y_join,
by = c(name = "approx_name"),
max_dist = 2,
distance_col = "distance"
)
#> name id approx_name grp distance
#> <char> <int> <char> <char> <num>
#> 1: Idea 1 Ideal A 1
#> 2: Premiom 2 Premium B 1
#> 3: Very Good 3 VeryGood C 1
#> 4: Gooood 4 Good D 2
Right join
fuzzystring_right_join(
x_join, y_join,
by = c(name = "approx_name"),
max_dist = 2,
distance_col = "distance"
)
#> name id approx_name grp distance
#> <char> <int> <char> <char> <num>
#> 1: Idea 1 Ideal A 1
#> 2: Premiom 2 Premium B 1
#> 3: Very Good 3 VeryGood C 1
#> 4: Gooood 4 Good D 2
Full join
fuzzystring_full_join(
x_join, y_join,
by = c(name = "approx_name"),
max_dist = 2,
distance_col = "distance"
)
#> name id approx_name grp distance
#> <char> <int> <char> <char> <num>
#> 1: Idea 1 Ideal A 1
#> 2: Premiom 2 Premium B 1
#> 3: Very Good 3 VeryGood C 1
#> 4: Gooood 4 Good D 2
Semi join (rows from x with a match in y)
fuzzystring_semi_join(
x_join, y_join,
by = c(name = "approx_name"),
max_dist = 2
)
#> name id
#> <char> <int>
#> 1: Idea 1
#> 2: Premiom 2
#> 3: Very Good 3
#> 4: Gooood 4
Anti join (rows from x without a match in y)
fuzzystring_anti_join(
x_join, y_join,
by = c(name = "approx_name"),
max_dist = 2
)
#> Empty data.table (0 rows and 2 cols): name,id
Using the generic fuzzystring_join()
If you prefer a single entry point, you can use
fuzzystring_join() directly by specifying
mode.
fuzzystring_join(
x_join, y_join,
by = c(name = "approx_name"),
max_dist = 2,
mode = "left",
distance_col = "distance"
)
#> name id approx_name grp distance
#> <char> <int> <char> <char> <num>
#> 1: Idea 1 Ideal A 1
#> 2: Premiom 2 Premium B 1
#> 3: Very Good 3 VeryGood C 1
#> 4: Gooood 4 Good D 2
Multiple Distance Methods
You can choose from various distance metrics provided by the
stringdist package:
# Optimal String Alignment (default)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "osa")
# Damerau-Levenshtein
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "dl")
# Jaro-Winkler (good for names)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "jw")
# Soundex (phonetic matching)
fuzzystring_inner_join(x, y, by = c(name = "approx_name"), method = "soundex")
Case-Insensitive Matching
Use ignore_case = TRUE to ignore capitalization:
fuzzystring_inner_join(
x, y,
by = c(name = "approx_name"),
ignore_case = TRUE,
max_dist = 1
)
Advanced Usage
Multiple Column Joins
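You can match on multiple columns, using a different matching function for each. The example below assumes that x and y also contain numeric value and approx_value columns, respectively; the Quick Start data frames do not include them, so add such columns before running it: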
fuzzystring_inner_join(
x, y,
by = c(name = "approx_name", value = "approx_value"),
match_fun = list(
name = function(x, y) stringdist::stringdist(x, y) <= 1,
value = function(x, y) abs(x - y) < 0.5
)
)