Skip to content

Commit

Permalink
it works for github sources
Browse files Browse the repository at this point in the history
  • Loading branch information
yujonglee committed Oct 19, 2024
1 parent 273b6d3 commit c91c109
Show file tree
Hide file tree
Showing 12 changed files with 380 additions and 178 deletions.
96 changes: 67 additions & 29 deletions core/lib/canary/index/trieve/client.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ defmodule Canary.Index.Trieve.Client do
dataset = Application.fetch_env!(:canary, :trieve_dataset)

Canary.rest_client(
receive_timeout: 2_000,
base_url: "https://api.trieve.ai/api",
headers: [
{"Content-Type", "application/json"},
Expand Down Expand Up @@ -62,19 +61,49 @@ defmodule Canary.Index.Trieve.Client do
tags: tags
} = chunk

tag_set =
if is_nil(tags) or tags == [] do
[
format_for_tagset(:source_id, source_id),
format_for_tagset(:empty_tags)
]
else
[
format_for_tagset(:source_id, source_id)
| Enum.map(tags, &format_for_tagset(:tag, &1))
]
end

%{
tracking_id: tracking_id,
group_tracking_ids: [group_tracking_id],
link: url,
chunk_html: content,
metadata: meta,
tag_set: [
format_for_tag(:source_id, source_id)
| Enum.map(tags, &format_for_tag(:tag, &1))
],
tag_set: tag_set,
convert_html_to_text: false,
upsert_by_tracking_id: true
}
|> then(fn data ->
if is_struct(chunk[:created_at], DateTime) do
data
|> Map.merge(%{
time_stamp: DateTime.to_iso8601(chunk.created_at)
})
else
data
end
end)
|> then(fn data ->
if is_binary(chunk[:title]) do
data
|> Map.merge(%{
fulltext_boost: %{boost_factor: 10, phrase: chunk.title}
})
else
data
end
end)
end)

# https://docs.trieve.ai/api-reference/chunk/create-or-upsert-chunk-or-chunks
Expand Down Expand Up @@ -108,53 +137,61 @@ defmodule Canary.Index.Trieve.Client do
def search(query, opts \\ []) do
tags = opts[:tags]
source_ids = Keyword.fetch!(opts, :source_ids)
search_type = if(question?(query), do: :fulltext, else: :hybrid)
search_type = if(question?(query), do: :hybrid, else: :fulltext)
receive_timeout = if(question?(query), do: 3_000, else: 1_500)

highlight_options =
if question?(query) do
%{
highlight_window: 1,
highlight_max_length: 4,
highlight_window: 8,
highlight_max_length: 5,
highlight_threshold: 0.5,
highlight_strategy: :v1
}
else
%{
highlight_window: 1,
highlight_window: 8,
highlight_max_length: 2,
highlight_threshold: 0.9,
highlight_strategy: :exactmatch
}
end

filters = %{
must:
[
%{
field: "tag_set",
match_any: Enum.map(source_ids, &format_for_tagset(:source_id, &1))
},
if(not is_nil(tags) and tags != [],
do: %{
field: "tag_set",
match_any: [
format_for_tagset(:empty_tags)
| Enum.map(tags, &format_for_tagset(:tag, &1))
]
},
else: nil
)
]
|> Enum.reject(&is_nil/1)
}

# https://docs.trieve.ai/api-reference/chunk-group/search-over-groups
case base()
|> Req.post(
receive_timeout: receive_timeout,
url: "/chunk_group/group_oriented_search",
json: %{
query: query,
filters: %{
must:
[
%{
field: "tag_set",
match_any: Enum.map(source_ids, &format_for_tag(:source_id, &1))
},
if(not is_nil(tags) and tags != [],
do: %{
field: "tag_set",
match_any: Enum.map(tags, &format_for_tag(:tag, &1))
},
else: nil
)
]
|> Enum.reject(&is_nil/1)
},
filters: filters,
page: 1,
page_size: 8,
group_size: 3,
search_type: search_type,
score_threshold: 0.1,
recency_bias: 0.5,
remove_stop_words: true,
slim_chunks: false,
typo_options: %{correct_typos: true},
Expand All @@ -178,9 +215,10 @@ defmodule Canary.Index.Trieve.Client do
query
|> String.split(" ")
|> Enum.reject(&(&1 == ""))
|> Enum.count() > 2
|> Enum.count() > 3
end

# Formats a value as a plain `kind:value` string for a chunk's `tag_set`.
# Only the `:source_id` and `:tag` kinds are supported; any other kind
# raises FunctionClauseError.
defp format_for_tag(:source_id, value) do
  "source_id:#{value}"
end

defp format_for_tag(:tag, value) do
  "tag:#{value}"
end
# Builds the delimited strings stored in (and filtered on) a chunk's
# `tag_set`. Every entry is wrapped in double underscores.

# Sentinel entry added to chunks that carry no tags at all.
defp format_for_tagset(:empty_tags) do
  "__empty_tags__"
end

# Entry identifying the source a chunk belongs to.
defp format_for_tagset(:source_id, value) do
  "__source_id:#{value}__"
end

# Entry for a single user-supplied tag.
defp format_for_tagset(:tag, value) do
  "__tag:#{value}__"
end
end
179 changes: 125 additions & 54 deletions core/lib/canary/searcher.ex
Original file line number Diff line number Diff line change
@@ -1,37 +1,35 @@
defmodule Canary.Searcher do
@callback run(list(any()), String.t(), keyword()) :: {:ok, list(map())} | {:error, any()}
@callback run(String.t(), keyword()) :: {:ok, list(map())} | {:error, any()}

def run(sources, query, opts \\ []) do
def run(query, opts \\ []) do
{cache, opts} = Keyword.pop(opts, :cache, false)

if cache do
with {:error, _} <- get_cache(sources, query, opts),
{:ok, result} <- impl().run(sources, query, opts) do
set_cache(sources, query, opts, result)
with {:error, _} <- get_cache(query, opts),
{:ok, result} <- impl().run(query, opts) do
set_cache(query, opts, result)
{:ok, result}
end
else
impl().run(sources, query, opts)
impl().run(query, opts)
end
end

defp set_cache(sources, query, opts, result) do
Cachex.put(:cache, key(sources, query, opts), result, ttl: :timer.minutes(3))
defp set_cache(query, opts, result) do
Cachex.put(:cache, key(query, opts), result, ttl: :timer.minutes(3))
end

defp get_cache(sources, query, opts) do
case Cachex.get(:cache, key(sources, query, opts)) do
defp get_cache(query, opts) do
case Cachex.get(:cache, key(query, opts)) do
{:ok, nil} -> {:error, :not_found}
{:ok, hit} -> {:ok, hit}
end
end

defp key(sources, query, opts) do
sources
|> Enum.map(& &1.id)
|> Enum.join(",")
|> Kernel.<>(":" <> query)
defp key(query, opts) do
query
|> Kernel.<>(":" <> Jason.encode!(opts[:tags]))
|> Kernel.<>(":" <> Jason.encode!(opts[:source_ids]))
end

defp impl(), do: Application.get_env(:canary, :searcher, Canary.Searcher.Default)
Expand All @@ -42,53 +40,126 @@ defmodule Canary.Searcher.Default do

require Ash.Query

def run(sources, query, _opts) do
{:ok, groups} =
Canary.Index.Trieve.Client.search(query, source_ids: Enum.map(sources, & &1.id))
def run(query, opts) do
with {:ok, groups} <- Canary.Index.Trieve.Client.search(query, opts) do
matches =
groups
|> Enum.map(&transform_result/1)
|> Enum.reject(&is_nil/1)

matches =
groups
|> Enum.map(fn %{"group" => group, "chunks" => chunks} ->
chunks =
chunks
|> Enum.map(fn chunk ->
{:ok, matches}
end
end

defp transform_result(%{
"group" => %{"metadata" => %{"type" => "webpage"} = group_meta},
"chunks" => chunks
}) do
chunks =
chunks
|> Enum.map(fn chunk ->
%{
"chunk" => %{"metadata" => meta, "link" => url},
"highlights" => highlights
} = chunk

cond do
meta["title"] == group_meta["title"] ->
nil

Enum.at(highlights, 0, nil) == nil ->
nil

true ->
%{
"chunk" => %{"metadata" => meta, "link" => url},
"highlights" => highlights
} = chunk

cond do
meta["title"] == group["metadata"]["title"] ->
nil

Enum.at(highlights, 0, nil) == nil ->
nil

true ->
%{
meta: meta,
url: url,
title: meta["title"],
excerpt: Enum.at(highlights, 0)
}
end
end)
|> Enum.reject(&is_nil/1)

if chunks == [] do
nil
meta: meta,
url: url,
title: meta["title"],
excerpt: Enum.at(highlights, 0)
}
end
end)
|> Enum.reject(&is_nil/1)

if chunks == [] do
nil
else
%{
type: group_meta["type"],
url: group_meta["url"],
title: group_meta["title"],
meta: %{},
sub_results: chunks
}
end
end

# Shapes a search group whose metadata type is "github_issue" into a result
# map. A chunk contributes a sub-result only when it has a first highlight;
# the group itself is always returned, even when no chunk qualifies.
defp transform_result(%{
       "group" => %{"metadata" => %{"type" => "github_issue"} = issue_meta},
       "chunks" => chunks
     }) do
  sub_results =
    Enum.flat_map(chunks, fn chunk ->
      # Assertive match: a chunk missing any of these keys raises MatchError.
      %{
        "chunk" => %{"metadata" => _meta, "link" => link},
        "highlights" => highlights
      } = chunk

      # Only the first highlight is used as the excerpt; chunks without one
      # are dropped from the sub-results.
      case Enum.at(highlights, 0) do
        missing when missing in [nil, false] -> []
        excerpt -> [%{url: link, excerpt: excerpt}]
      end
    end)

  %{
    type: issue_meta["type"],
    url: issue_meta["url"],
    title: issue_meta["title"],
    meta: %{closed: issue_meta["closed"]},
    sub_results: sub_results
  }
end

defp transform_result(%{
"group" => %{"metadata" => %{"type" => "github_discussion"} = group_meta},
"chunks" => chunks
}) do
chunks =
chunks
|> Enum.map(fn chunk ->
%{
"chunk" => %{"metadata" => _meta, "link" => url},
"highlights" => highlights
} = chunk

if Enum.at(highlights, 0) do
%{
type: group["metadata"]["type"],
url: group["metadata"]["url"],
title: group["metadata"]["title"],
meta: %{},
sub_results: chunks
url: url,
excerpt: Enum.at(highlights, 0)
}
else
nil
end
end)
|> Enum.reject(&is_nil/1)

{:ok, matches}
%{
type: group_meta["type"],
url: group_meta["url"],
title: group_meta["title"],
meta: %{
closed: group_meta["closed"],
answered: group_meta["answered"]
},
sub_results: chunks
}
end
end
Loading

0 comments on commit c91c109

Please sign in to comment.