Skip to content

Commit

Permalink
Update fuzzy_match.R
Browse files Browse the repository at this point in the history
* Update the fuzzy match algorithm to cycle through multiple "same distance" matches until one passes the "first letter" rules. This was included because an instance was found where an equally close match belonged to a completely different genus and, because there were multiple matches, the fuzzy match returned NA for all of them. This also means that if there are multiple equally good matches, the result will align with the first one that passes the rules.
  • Loading branch information
ehwenk committed Mar 11, 2024
1 parent 3fcc6a3 commit 9e943c8
Showing 1 changed file with 62 additions and 45 deletions.
107 changes: 62 additions & 45 deletions R/fuzzy_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,66 +57,83 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
min_dist_per_c <- min(distance_c) / stringr::str_length(txt)

i <- which(distance_c==min_dist_abs_c)
keep = FALSE

if(
## Within allowable number of characters (absolute)
min_dist_abs_c <= max_distance_abs &
## Within allowable number of characters (relative)
min_dist_per_c <= max_distance_rel &
min_dist_per_c <= max_distance_rel #&
## Is a unique solution
length(i)<= n_allowed
) {
## identify number of words in the matched string
words_in_match <- 1 + stringr::str_count(accepted_list[i]," ")
#length(i) <= n_allowed
) {

## identify the first letter of the first word in the matched string
match_word1_start <- stringr::str_extract(accepted_list[i], "[:alpha:]")

## identify the first letter of the second word in the matched string (if the matched string includes 2+ words)
if(words_in_text > 1 & epithet_letters == 2) {
if(nchar(word(accepted_list[i],2)) == 1) {
match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]")
for (j in 1:length(i)) {

if (keep == TRUE) {

break()

} else {
match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:][:alpha:]|[:digit:]")
}
}
## identify number of words in the matched string
words_in_match <- 1 + stringr::str_count(accepted_list[i][j]," ")

## identify the first letter of the first word in the matched string
match_word1_start <- stringr::str_extract(accepted_list[i][j], "[:alpha:]")

## identify the first letter of the second word in the matched string (if the matched string includes 2+ words)
if(words_in_text > 1 & epithet_letters == 2) {
if(nchar(word(accepted_list[i][j],2)) == 1) {
match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:]|[:digit:]")
} else {
match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:][:alpha:]|[:digit:]")
}
}

if(words_in_text > 1 & epithet_letters == 1) {
match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:]|[:digit:]")
}

if(words_in_text > 1 & epithet_letters == 1) {
match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]")
}

## identify the first letter of the third word in the matched string (if the matched string includes 3+ words)
if(words_in_text > 2) {
match_word3_start <- stringr::str_extract(word(accepted_list[i],3), "[:alpha:]|[:digit:]")
}
## identify the first letter of the third word in the matched string (if the matched string includes 3+ words)
if(words_in_text > 2) {
match_word3_start <- stringr::str_extract(word(accepted_list[i][j],3), "[:alpha:]|[:digit:]")
}

## keep match if the first letters of the first three words (or fewer if applicable) in the string to match
## are identical to the first letters of the first three words in the matched string

keep = FALSE
if(words_in_text == 1) {
if (txt_word1_start == match_word1_start) {
keep = TRUE }

} else if(words_in_text == 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep = TRUE }

} else if(words_in_text > 2) {
if (words_in_match > 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
keep = TRUE }
} else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep = TRUE }
}

## keep match if the first letters of the first three words (or fewer if applicable) in the string to match
## are identical to the first letters of the first three words in the matched string

if(keep == TRUE) {

return(accepted_list[i][j])

}

return(NA)
}

if(words_in_text == 1) {
if (txt_word1_start == match_word1_start) {
keep = TRUE }

} else if(words_in_text == 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep = TRUE }

} else if(words_in_text > 2) {
if (words_in_match > 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
keep = TRUE }
} else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep = TRUE }
return(NA)
}

if(keep == TRUE) {

return(accepted_list[i])

}
return(NA)
}

return(NA)
}

0 comments on commit 9e943c8

Please sign in to comment.