diff --git a/R/fuzzy_match.R b/R/fuzzy_match.R
index 65d4f4cf..e34a8df7 100644
--- a/R/fuzzy_match.R
+++ b/R/fuzzy_match.R
@@ -49,6 +49,29 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
     txt_word3_start <- stringr::str_extract(word(txt,3), "[:alpha:]|[:digit:]")
   }
 
+  ## Subset the accepted list to names that start with the same first letter
+  ## as `txt`, reducing the number of fuzzy comparisons made in the next step.
+  ## `which()` drops comparisons that evaluate to NA (e.g. a name containing
+  ## no letter); bare logical indexing would turn them into NA entries.
+  ## NOTE(review): `txt_word3_start` above is extracted with
+  ## "[:alpha:]|[:digit:]" but these filters use "[:alpha:]" only - confirm
+  ## the difference is intended.
+  accepted_list <- accepted_list[which(
+    stringr::str_to_lower(stringr::str_extract(accepted_list, "[:alpha:]")) ==
+      stringr::str_to_lower(txt_word1_start)
+  )]
+
+  ## To speed up matching further, also require the same first letter in the
+  ## second word. Names without a second word yield NA and are dropped.
+  if (words_in_text > 1) {
+    accepted_list <- accepted_list[which(
+      stringr::str_to_lower(
+        stringr::str_extract(stringr::word(accepted_list, 2), "[:alpha:]")
+      ) ==
+        stringr::str_to_lower(txt_word2_start)
+    )]
+  }
+
   ## identify the number of characters that must change for the text string to match each of the possible accepted names
   distance_c <- utils::adist(txt, accepted_list, fixed=TRUE)[1,]
 