Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌎 Pass source metadata through content refresher agent #1185

Merged
merged 7 commits into from
Aug 4, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,21 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
keywords = await find_content_kws(target_content)
logger.info(keywords)

source_urls = search_results(keywords)
if target_url in source_urls: # TODO: check based on content overlap
source_urls.remove(target_url)
logger.info(source_urls)
sources = search_results(keywords)
sources = [
source for source in sources if source["url"] != target_url
] # TODO: check based on content overlap
logger.info(sources)

source_contents = [
await get_page_content(url)
for url in source_urls[:3] # TODO: remove limit of 3 sources
]
for source in sources[:3]: # TODO: remove limit of 3 sources
source["content"] = await get_page_content(source["url"])
logger.info(sources)

source_contents = [
content for content in source_contents if content is not None
source for source in sources if source.get("content", None) is not None
]

logger.info(source_contents)

new_info = [
await find_new_info(target_content, source_content)
for source_content in source_contents
Expand Down Expand Up @@ -134,7 +134,7 @@ async def find_content_kws(content: str) -> str:
)


def search_results(search_query: str) -> list[str]:
def search_results(search_query: str) -> list[dict[str, str]]:
# use SERP API
response = requests.post(
f"https://google.serper.dev/search",
Expand All @@ -147,14 +147,26 @@ def search_results(search_query: str) -> list[str]:
},
)
response.raise_for_status()
urls = [result["link"] for result in response.json()["organic"]]
return urls

source_information = [
{
"url": result.get("link", None),
"title": result.get("title", None),
"date": result.get("date", None),
}
for result in response.json().get("organic", [])
]
return source_information


async def find_new_info(target: str, source: dict[str, str]) -> str:
source_metadata = f"{source['url']}, {source['title']}" + (
f", {source['date']}" if source["date"] else ""
)
source_content = source["content"]

async def find_new_info(target: str, source: str) -> str:
# Claude: info mentioned in source that is not mentioned in target
prompt = HumanAssistantPrompt(
human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.",
human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source_content}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.",
assistant_prompt="Here is a list of claims in the SOURCE that are not in the TARGET:",
)

Expand All @@ -164,13 +176,14 @@ async def find_new_info(target: str, source: str) -> str:
)

new_info = "\n".join(response.split("\n\n"))
new_info += "\n" + source_metadata
return new_info


async def add_info(target: str, info: str) -> str:
# Claude: rewrite target to include the info
prompt = HumanAssistantPrompt(
human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles.",
human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles. Maintain the format of the TARGET article. After any source info that you add, include inline citations using the following example format: 'So this is a cited sentence at the end of a paragraph[1](https://www.wisnerbaum.com/prescription-drugs/gardasil-lawsuit/, Gardasil Vaccine Lawsuit Update August 2023 - Wisner Baum).' Do not add citations for any info in the TARGET article. Do not list citations separately at the end of the response",
assistant_prompt="Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
)

Expand Down