Update fuzzy_match.R

* Update the fuzzy match algorithm to cycle through multiple "same distance" matches until one passes the "first letter" rules. This was included because we found an instance where there was an equally close match that was a completely different genus and, because there were multiple matches, the fuzzy match returned NA for all of them. This also means that if there are multiple equally good matches, it will align with the first.
traitecoevo · Mar 11, 2024 · 9e943c8 · 9e943c8
1 parent 3fcc6a3
commit 9e943c8
Showing 1 changed file with 62 additions and 45 deletions.
diff --git a/R/fuzzy_match.R b/R/fuzzy_match.R
@@ -57,66 +57,83 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
   min_dist_per_c <-  min(distance_c) / stringr::str_length(txt)
 
   i <- which(distance_c==min_dist_abs_c)
+  keep = FALSE
 
   if(
     ## Within allowable number of characters (absolute)
     min_dist_abs_c <= max_distance_abs &
     ## Within allowable number of characters (relative)
-    min_dist_per_c <= max_distance_rel &
+    min_dist_per_c <= max_distance_rel #&
     ## Is a unique solution
-    length(i)<= n_allowed
-  ) {
-    ## identify number of words in the matched string
-    words_in_match <- 1 + stringr::str_count(accepted_list[i]," ")
+    #length(i) <= n_allowed
+    ) {
 
-    ## identify the first letter of the first word in the matched string
-    match_word1_start <- stringr::str_extract(accepted_list[i], "[:alpha:]")
-
-    ## identify the first letter of the second word in the matched string (if the matched string includes 2+ words)
-    if(words_in_text > 1 & epithet_letters == 2) {
-      if(nchar(word(accepted_list[i],2)) == 1) {
-        match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]")
+    for (j in 1:length(i)) {
+
+      if (keep == TRUE) {
+
+        break()
+
       } else {
-        match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:][:alpha:]|[:digit:]")
-      }
-    }
+        ## identify number of words in the matched string
+        words_in_match <- 1 + stringr::str_count(accepted_list[i][j]," ")
+
+        ## identify the first letter of the first word in the matched string
+        match_word1_start <- stringr::str_extract(accepted_list[i][j], "[:alpha:]")
+
+        ## identify the first letter of the second word in the matched string (if the matched string includes 2+ words)
+        if(words_in_text > 1 & epithet_letters == 2) {
+          if(nchar(word(accepted_list[i][j],2)) == 1) {
+            match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:]|[:digit:]")
+          } else {
+            match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:][:alpha:]|[:digit:]")
+          }
+        }
+
+        if(words_in_text > 1 & epithet_letters == 1) {
+            match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:]|[:digit:]")
+        }
 
-    if(words_in_text > 1 & epithet_letters == 1) {
-        match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]")
-    }
-
-    ## identify the first letter of the third word in the matched string (if the matched string includes 3+ words)
-    if(words_in_text > 2) {
-      match_word3_start <- stringr::str_extract(word(accepted_list[i],3), "[:alpha:]|[:digit:]")
-    }
+        ## identify the first letter of the third word in the matched string (if the matched string includes 3+ words)
+        if(words_in_text > 2) {
+          match_word3_start <- stringr::str_extract(word(accepted_list[i][j],3), "[:alpha:]|[:digit:]")
+        }
+
+        ## keep match if the first letters of the first three words (or fewer if applicable) in the string to match 
+        ## are identical to the first letters of the first three words in the matched string
 
-    keep = FALSE
+        if(words_in_text == 1) {
+          if (txt_word1_start == match_word1_start) {
+            keep = TRUE }
+
+        } else if(words_in_text == 2) {
+          if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
+            keep = TRUE }
+
+        } else if(words_in_text > 2) {
+          if (words_in_match > 2) {
+            if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
+              keep = TRUE }
+          } else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
+            keep = TRUE }
+        }
 
-    ## keep match if the first letters of the first three words (or fewer if applicable) in the string to match 
-    ## are identical to the first letters of the first three words in the matched string
+
+        if(keep == TRUE) {
+
+          return(accepted_list[i][j])
+
+        }
+
+        return(NA)
+      }
 
-    if(words_in_text == 1) {
-      if (txt_word1_start == match_word1_start) {
-        keep = TRUE }
-
-    } else if(words_in_text == 2) {
-      if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
-        keep = TRUE }
-
-    } else if(words_in_text > 2) {
-      if (words_in_match > 2) {
-        if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
-          keep = TRUE }
-      } else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
-        keep = TRUE }
+      return(NA)
     }
 
-    if(keep == TRUE) {
-
-      return(accepted_list[i])
-
-    }
     return(NA)
   }
+
   return(NA)
 }
+