Skip to content

Commit

Permalink
Fix & improve FTS. Implements #552 (#556)
Browse files Browse the repository at this point in the history
Improve free text search by:
 * fixing n-gram generation when a space is added in front of a word
 * giving 4-grams a higher weight than multiple matches of the same 3-gram by not counting duplicate n-grams (as a result the score cannot be >1 anymore)
 * lowering the min-score to 0.33, which means at least 1 out of 3 n-grams must match in order for a task to be considered a result

The changes are supposed to favor longer matches over many shorter matches.
  • Loading branch information
dmfs authored Dec 7, 2017
1 parent aa955cd commit 04558da
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ public final class NGramGenerator
private boolean mAddSpaceInFront = false;
private Locale mLocale = Locale.getDefault();

private char[] mTempArray;


public NGramGenerator(int n)
{
Expand All @@ -59,8 +57,6 @@ public NGramGenerator(int n, int minWordLen)
{
mN = n;
mMinWordLen = minWordLen;
mTempArray = new char[n];
mTempArray[0] = ' ';
}


Expand Down Expand Up @@ -159,12 +155,11 @@ public Set<String> getNgrams(Set<String> set, String data)
}


public void getNgrams(String word, Set<String> ngrams)
private void getNgrams(String word, Set<String> ngrams)
{
final int len = word.length();
final int minWordLen = mMinWordLen;

if (len < minWordLen)
if (len < mMinWordLen)
{
return;
}
Expand All @@ -181,21 +176,8 @@ public void getNgrams(String word, Set<String> ngrams)
{
/*
* Add another String with a space and the first n-1 characters of the word.
*
* We could just call
*
* ngrams.add(" " + word.substring(0, Math.min(len, n - 1));
*
* But it's probably way more efficient like this:
*/
char[] tempArray = mTempArray;

int count = Math.min(len, n - 1);
for (int i = 0; i < count; ++i)
{
tempArray[i + 1] = word.charAt(i);
}
ngrams.add(new String(tempArray));
ngrams.add(" " + word.substring(0, Math.min(len, n - 1)));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@


/**
* Supports the {@link TaskDatabaseHelper} in the manner of full-text-search.
* Supports the {@link TaskDatabaseHelper} in the matter of full-text-search.
*
* @author Tobias Reinsch <tobias@dmfs.org>
* @author Marten Gajda <marten@dmfs.org>
*/
public class FTSDatabaseHelper
{

private final static float SEARCH_RESULTS_MIN_SCORE = 0.4f;
private final static float SEARCH_RESULTS_MIN_SCORE = 0.33f;

/**
* A Generator for 3-grams.
Expand Down Expand Up @@ -127,7 +127,7 @@ public interface NGramColumns
+ " Integer PRIMARY KEY AUTOINCREMENT, " + NGramColumns.TEXT + " Text)";

// FIXME: at present the minimum score is hard-coded; can we leave that decision to the caller?
private final static String SQL_RAW_QUERY_SEARCH_TASK = "SELECT %s " + ", min(1.0*count(*)/?, 1.0) as " + TaskContract.Tasks.SCORE + " from "
private final static String SQL_RAW_QUERY_SEARCH_TASK = "SELECT %s " + ", (1.0*count(DISTINCT " + NGramColumns.NGRAM_ID + ")/?) as " + TaskContract.Tasks.SCORE + " from "
+ FTS_NGRAM_TABLE + " join " + FTS_CONTENT_TABLE + " on (" + FTS_NGRAM_TABLE + "." + NGramColumns.NGRAM_ID + "=" + FTS_CONTENT_TABLE + "."
+ FTSContentColumns.NGRAM_ID + ") join " + Tables.INSTANCE_VIEW + " on (" + Tables.INSTANCE_VIEW + "." + TaskContract.Instances.TASK_ID + " = " + FTS_CONTENT_TABLE + "."
+ FTSContentColumns.TASK_ID + ") where %s group by " + TaskContract.Instances.TASK_ID + " having " + TaskContract.Tasks.SCORE + " >= " + SEARCH_RESULTS_MIN_SCORE
Expand Down

0 comments on commit 04558da

Please sign in to comment.