From bba1b91fd948ed7a4555c02aa7a86ed0111f3b46 Mon Sep 17 00:00:00 2001 From: Gabriel Mechali Date: Wed, 5 Feb 2025 16:36:13 -0800 Subject: [PATCH 1/3] [Autocomplete] Abort the autocomplete call if we execute the query (#4929) This is to fix a bug that Prem identified where the autocomplete response remained open, or would appear after the query execution because we don't close it before the execution, and don't abort calls still in progress. Verified the fix locally. --- .../nl_search_bar/auto_complete_input.tsx | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/static/js/components/nl_search_bar/auto_complete_input.tsx b/static/js/components/nl_search_bar/auto_complete_input.tsx index 92c030d36a..ae08590094 100644 --- a/static/js/components/nl_search_bar/auto_complete_input.tsx +++ b/static/js/components/nl_search_bar/auto_complete_input.tsx @@ -147,9 +147,16 @@ export function AutoCompleteInput( useEffect(() => { // TriggerSearch state used to ensure onSearch only called after text updated. - props.onSearch(); + executeQuery(); }, [triggerSearch, setTriggerSearch]); + function executeQuery(): void { + setResults({ placeResults: [], svResults: [] }); + setHoveredIdx(-1); + controller.current.abort(); // Ensure autocomplete responses can't come back. + props.onSearch(); + } + function onInputChange(e: React.ChangeEvent): void { const currentText = e.target.value; changeText(currentText); @@ -252,7 +259,7 @@ export function AutoCompleteInput( if (hoveredIdx >= 0) { selectResult(results.placeResults[hoveredIdx], hoveredIdx); } else { - props.onSearch(); + executeQuery(); } break; case "ArrowUp": @@ -343,7 +350,7 @@ export function AutoCompleteInput( autoComplete="one-time-code" autoFocus={props.shouldAutoFocus} > -
+
{isHeaderBar && }
From 6b6b657d08fa4646a952a0c41870ab51546c239f Mon Sep 17 00:00:00 2001 From: chejennifer <69875368+chejennifer@users.noreply.github.com> Date: Thu, 6 Feb 2025 08:20:13 -0800 Subject: [PATCH 2/3] Create a DataGemma UI (#4913) This creates a basic UI version of the DataGemma notebooks ([rag](https://colab.google.com/github/datacommonsorg/llm-tools/blob/master/notebooks/datagemma_rag.ipynb), [rig](https://colab.google.com/github/datacommonsorg/llm-tools/blob/master/notebooks/datagemma_rig.ipynb)), so that it's easier to play with the two approaches. This is only for internal use and will only be enabled in the local environment and autopush. ![image](https://github.com/user-attachments/assets/788f8de0-b40c-45b3-bc0e-8675db6c84cb) ![image](https://github.com/user-attachments/assets/e7e10ccd-d309-4b7f-a05e-1e0fda3957f9) --- build/web_server/Dockerfile | 2 +- server/__init__.py | 50 +++++ server/app_env/_base.py | 3 + server/app_env/autopush.py | 1 + server/app_env/local.py | 1 + server/requirements.txt | 2 + server/routes/dev_datagemma/api.py | 82 ++++++++ server/routes/dev_datagemma/html.py | 24 +++ server/templates/dev/datagemma.html | 34 +++ static/js/apps/datagemma/app.tsx | 310 ++++++++++++++++++++++++++++ static/js/apps/datagemma/main.ts | 31 +++ static/webpack.config.js | 1 + 12 files changed, 540 insertions(+), 1 deletion(-) create mode 100644 server/routes/dev_datagemma/api.py create mode 100644 server/routes/dev_datagemma/html.py create mode 100644 server/templates/dev/datagemma.html create mode 100644 static/js/apps/datagemma/app.tsx create mode 100644 static/js/apps/datagemma/main.ts diff --git a/build/web_server/Dockerfile b/build/web_server/Dockerfile index 2224e0b9da..9eb1fa0b63 100644 --- a/build/web_server/Dockerfile +++ b/build/web_server/Dockerfile @@ -15,7 +15,7 @@ FROM nikolaik/python-nodejs:python3.11-nodejs18-slim as base -RUN apt-get update && apt-get -y upgrade +RUN apt-get update && apt-get -y upgrade && apt-get install -y git ARG ENV ENV 
def _get_api_key(env_keys=(), gcp_project='', gcp_path=''):
    """Gets an api key first from the environment, then from GCP secrets.

    Args:
        env_keys: An iterable of environment variable names to try, in order.
            The first one that is set to a non-empty value is returned. The
            default is an immutable empty tuple so no mutable object is
            shared between calls.
        gcp_project: The GCP project to use to get the api key from GCP
            secrets.
        gcp_path: The name of the secret holding the api key; it is passed as
            the secret argument of secret_version_path.

    Returns:
        The API key if one is found, otherwise an empty string.

    TODO: use this method everywhere else in this file
    """
    # Try to get the key from the environment. Unset and empty variables are
    # both skipped; each variable is read only once.
    for env_key in env_keys:
        value = os.environ.get(env_key)
        if value:
            return value

    # Try to get the key from secrets. Both the project and the secret name
    # are required. Errors raised by the Secret Manager client are not caught
    # here, so they propagate to the caller.
    if gcp_project and gcp_path:
        secret_client = secretmanager.SecretManagerServiceClient()
        secret_name = secret_client.secret_version_path(
            gcp_project, gcp_path, 'latest')
        secret_response = secret_client.access_secret_version(name=secret_name)
        # Newlines are removed from the decoded secret payload.
        return secret_response.payload.data.decode('UTF-8').replace('\n', '')

    # If key is not found, return an empty string
    return ''


def register_routes_datagemma(app, cfg):
    """Registers the DataGemma blueprints and loads the api keys they use.

    Args:
        app: The application object to install the blueprints on. Its config
            mapping receives the GEMINI_API_KEY and DC_NL_API_KEY entries.
        cfg: The config object; its SECRET_PROJECT attribute is the GCP
            project used when a key is not present in the environment.
    """
    # Install blueprint for DataGemma page. The route modules are imported
    # here, so they are only loaded when this function is called.
    from server.routes.dev_datagemma import api as dev_datagemma_api
    app.register_blueprint(dev_datagemma_api.bp)
    from server.routes.dev_datagemma import html as dev_datagemma_html
    app.register_blueprint(dev_datagemma_html.bp)

    # Set the gemini api key
    app.config['GEMINI_API_KEY'] = _get_api_key(['GEMINI_API_KEY'],
                                                cfg.SECRET_PROJECT,
                                                'gemini-api-key')
    # Set the DC NL api key
    app.config['DC_NL_API_KEY'] = _get_api_key(['DC_NL_API_KEY'],
                                               cfg.SECRET_PROJECT,
                                               'dc-nl-api-key')
register_routes_common(app): # apply blueprints for main app from server.routes import static @@ -282,6 +329,9 @@ def create_app(nl_root=DEFAULT_NL_ROOT): if cfg.SHOW_SUSTAINABILITY: register_routes_sustainability(app) + if cfg.ENABLE_DATAGEMMA: + register_routes_datagemma(app, cfg) + # Load topic page config topic_page_configs = libutil.get_topic_page_config() app.config['TOPIC_PAGE_CONFIG'] = topic_page_configs diff --git a/server/app_env/_base.py b/server/app_env/_base.py index a3f3b18d69..56e6da6ee5 100644 --- a/server/app_env/_base.py +++ b/server/app_env/_base.py @@ -98,3 +98,6 @@ class Config: # Whether to enable BigQuery for instance. This is primarily used for # accessing the observation browser pages. ENABLE_BQ = False + # Whether to enable the DataGemma UI for this instance. This UI should only be + # enabled for internal instances. + ENABLE_DATAGEMMA = False diff --git a/server/app_env/autopush.py b/server/app_env/autopush.py index ac25f61ca2..418a7cd370 100644 --- a/server/app_env/autopush.py +++ b/server/app_env/autopush.py @@ -25,3 +25,4 @@ class Config(_base.Config): HIDE_DEBUG = False USE_MEMCACHE = False ENABLE_BQ = True + ENABLE_DATAGEMMA = True diff --git a/server/app_env/local.py b/server/app_env/local.py index 0f2ca34e09..f1f0473814 100644 --- a/server/app_env/local.py +++ b/server/app_env/local.py @@ -21,6 +21,7 @@ class Config(_base.Config): SCHEME = 'http' USE_MEMCACHE = False ENABLE_BQ = True + ENABLE_DATAGEMMA = True class DCConfig(Config): diff --git a/server/requirements.txt b/server/requirements.txt index fbb9001536..aabab8b3c3 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -40,3 +40,5 @@ typing-extensions==4.10.0 webdriver-manager==4.0.0 Werkzeug==3.0.6 wheel==0.38.1 +# TODO: publish the library to pypi so we don't have to install from git +git+https://github.com/datacommonsorg/llm-tools.git diff --git a/server/routes/dev_datagemma/api.py b/server/routes/dev_datagemma/api.py new file mode 100644 index 
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Endpoints for DataGemma page"""

import json

from data_gemma import DataCommons
from data_gemma import GoogleAIStudio
from data_gemma import RAGFlow
from data_gemma import RIGFlow
from data_gemma import VertexAI
import flask
from flask import current_app
from flask import request
from flask import Response

# Define blueprint
bp = flask.Blueprint('dev_datagemma_api',
                     __name__,
                     url_prefix='/api/dev/datagemma')

_RIG_MODE = 'rig'
_RAG_MODE = 'rag'

# TODO: consider moving these specifications to a config somewhere
_VERTEX_AI_RIG = VertexAI(project_id='datcom-website-dev',
                          location='us-central1',
                          prediction_endpoint_id='4999251772590522368')
_VERTEX_AI_RAG = VertexAI(project_id='datcom-website-dev',
                          location='us-central1',
                          prediction_endpoint_id='3459865124959944704')


def _get_datagemma_result(query, mode):
    """Runs the DataGemma flow selected by `mode` on `query`.

    Args:
        query: Query to run the datagemma flow on.
        mode: Which flow to run; expected to be _RIG_MODE or _RAG_MODE.

    Returns:
        Whatever the flow's query() call returns, or None when `mode` matches
        neither flow. The flow result is a FlowResponse as defined here:
        https://github.com/datacommonsorg/llm-tools/blob/main/data_gemma/base.py#L116
    """
    # The Data Commons fetcher is built before the mode is inspected, so the
    # DC_NL_API_KEY config entry is read on every call.
    data_fetcher = DataCommons(api_key=current_app.config['DC_NL_API_KEY'])

    if mode == _RIG_MODE:
        rig_flow = RIGFlow(llm=_VERTEX_AI_RIG, data_fetcher=data_fetcher)
        return rig_flow.query(query=query)

    if mode == _RAG_MODE:
        # RAG uses a separate Gemini model to produce the final answer.
        answer_llm = GoogleAIStudio(
            model='gemini-1.5-pro',
            api_keys=[current_app.config['GEMINI_API_KEY']])
        rag_flow = RAGFlow(llm_question=_VERTEX_AI_RAG,
                           llm_answer=answer_llm,
                           data_fetcher=data_fetcher)
        return rag_flow.query(query=query)

    return None


@bp.route('/query')
def datagemma_query():
    """Handles /api/dev/datagemma/query.

    Reads the `query` and `mode` request arguments. Responds with a 400 and a
    plain-text message when `query` is missing or `mode` is not one of the
    supported modes; otherwise responds with a JSON object holding the
    `answer` and `debug` strings of the flow result (both empty when the flow
    produced nothing).
    """
    user_query = request.args.get('query')
    flow_mode = request.args.get('mode')

    if not user_query:
        return 'error: must provide a query field', 400
    # A missing mode is None (or ''), which is not a supported value either.
    if flow_mode not in (_RIG_MODE, _RAG_MODE):
        return f'error: must provide a mode field with values {_RIG_MODE} or {_RAG_MODE}', 400

    flow_response = _get_datagemma_result(user_query, flow_mode)
    if flow_response:
        payload = {
            'answer': flow_response.answer(),
            'debug': flow_response.debug(),
        }
    else:
        payload = {'answer': '', 'debug': ''}
    return Response(json.dumps(payload), 200, mimetype='application/json')
+"""DataGemma page routes""" + +import flask + +# Define blueprint +bp = flask.Blueprint("dev-datagemma", __name__, url_prefix='/dev/datagemma') + + +@bp.route('/') +def dev_datagemma(): + return flask.render_template('dev/datagemma.html') \ No newline at end of file diff --git a/server/templates/dev/datagemma.html b/server/templates/dev/datagemma.html new file mode 100644 index 0000000000..293fd358ba --- /dev/null +++ b/server/templates/dev/datagemma.html @@ -0,0 +1,34 @@ +{# + Copyright 2025 Google LLC + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + #} + {%- extends BASE_HTML -%} + + {% set main_id = 'dev-datagemma' %} + {% set page_id = 'page-dev-datagemma' %} + {% set title = 'Datagemma' %} + {% set is_hide_header_search_bar = 'true' %} + + {% block head %} + + {% endblock %} + + {% block content %} +
+ {% endblock %} + + {% block footer %} + + {% endblock %} + \ No newline at end of file diff --git a/static/js/apps/datagemma/app.tsx b/static/js/apps/datagemma/app.tsx new file mode 100644 index 0000000000..11bf03cdd0 --- /dev/null +++ b/static/js/apps/datagemma/app.tsx @@ -0,0 +1,310 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** @jsxImportSource @emotion/react */ + +/** + * Main component for the datagemma page + */ + +import { css, ThemeProvider } from "@emotion/react"; +import axios from "axios"; +import _ from "lodash"; +import queryString from "query-string"; +import React, { ReactElement, useEffect, useState } from "react"; +import Collapsible from "react-collapsible"; +import ReactMarkdown from "react-markdown"; +import { Input } from "reactstrap"; +import rehypeRaw from "rehype-raw"; +import remarkGfm from "remark-gfm"; + +import theme from "../../theme/theme"; +import { stringifyFn } from "../../utils/axios"; +import { updateHash } from "../../utils/url_utils"; +import { processTableText } from "../eval_retrieval_generation/util"; + +// Constants for query modes +const RIG_MODE = "rig"; +const RAG_MODE = "rag"; +// Constants for URL hash parameters +const URL_HASH_PARAMS = { + query: "q", + mode: "m", +}; + +// Interface for the response received from DataGemmaAPI. +interface DataGemmaAPIResponse { + answer: string; + debug: string; +} + +// Interface for the displayed answer. 
+interface DisplayedAnswer { + answer: string; + footnotes: string; + debugInfo: string; +} + +function getSectionTrigger(title: string, opened: boolean): JSX.Element { + return ( +
+ + {opened ? "arrow_drop_down" : "arrow_right"} + + {title} +
+ ); +} + +/** + * Helper function to process a RIG query response into a displayable answer. + * Splits the response into the answer and footnotes sections. + * @param response The raw response string from the RIG query. + * @returns A DisplayedAnswer object containing the parsed answer and footnotes. + */ +function processRigResponse(response: DataGemmaAPIResponse): DisplayedAnswer { + const answer_parts = response.answer.split("#### FOOTNOTES ####\n"); + let footnotes = ""; + if (answer_parts.length > 1) { + footnotes = answer_parts[1]; + footnotes = footnotes.replaceAll("\n", "\n\n"); + } + return { answer: answer_parts[0], footnotes, debugInfo: response.debug }; +} + +/** + * Helper function to process a RAG query response into a displayable answer. + * Splits the response into the answer and footnotes sections. + * @param response The raw response string from the RAG query. + * @returns A DisplayedAnswer object containing the parsed answer and footnotes. + */ +function processRagResponse(response: DataGemmaAPIResponse): DisplayedAnswer { + const answer_parts = response.answer.split("#### TABLES ####\n"); + let footnotes = ""; + if (answer_parts.length > 1) { + const footnotes_part = answer_parts[1]; + const table_list = footnotes_part.split("Table"); + for (const t of table_list) { + const trimmed_t = t.trim(); + if (!trimmed_t) { + continue; + } + + const title = _.cloneDeep(trimmed_t).split("\n", 1)[0]; + const tableContent = trimmed_t.replace(title, ""); + const processedTable = processTableText(tableContent.trim()); + footnotes += "Table " + title + "\n" + processedTable + "\n\n"; + } + } + return { answer: answer_parts[0], footnotes, debugInfo: response.debug }; +} + +/** + * Application container + */ +export function App(): ReactElement { + const [query, setQuery] = useState(""); + const [answer, setAnswer] = useState(null); + const [mode, setMode] = useState(RIG_MODE); + const [showLoading, setShowLoading] = useState(false); + + /** + * useEffect hook 
to handle initial loading of information from the URL hash. + */ + useEffect(() => { + const hashParams = queryString.parse(window.location.hash); + const hashQuery = (hashParams[URL_HASH_PARAMS.query] || "") as string; + const hashMode = (hashParams[URL_HASH_PARAMS.mode] || "") as string; + if (hashQuery) { + setQuery(hashQuery); + } + if (hashMode) { + setMode(hashMode); + } + }, []); + + /** + * Function to execute the query when the "Run" button is clicked. + */ + function onQueryRun(): void { + setShowLoading(true); + updateHash({ [URL_HASH_PARAMS.query]: query }); + axios + .get("/api/dev/datagemma/query", { + params: { query, mode }, + paramsSerializer: stringifyFn, + }) + .then((resp) => { + if (mode === RIG_MODE) { + setAnswer(processRigResponse(resp.data)); + } else if (mode === RAG_MODE) { + setAnswer(processRagResponse(resp.data)); + } + }) + .catch(() => { + setAnswer({ + answer: "There was a problem running the query, please try again.", + footnotes: "", + debugInfo: "", + }); + }) + .finally(() => { + setShowLoading(false); + }); + } + + return ( + +
+
+ DataGemma Playground +
+
+
+ + setQuery(e.target.value)} + /> +
+
+ {[RIG_MODE, RAG_MODE].map((m) => { + return ( +
+ { + setMode(m); + updateHash({ [URL_HASH_PARAMS.mode]: m }); + }} + /> + +
+ ); + })} +
+
+ Run +
+
+
table, + th, + td { + border: solid; + } + `} + > + {showLoading &&
Loading ...
} + {!showLoading && answer && ( +
+
+ Answer +
+ + {answer.answer} + + {answer.footnotes && ( + + + {answer.footnotes} + + + )} + {answer.debugInfo && ( + + + {answer.debugInfo} + + + )} +
+ )} +
+
+
+ ); +} diff --git a/static/js/apps/datagemma/main.ts b/static/js/apps/datagemma/main.ts new file mode 100644 index 0000000000..28b92b406b --- /dev/null +++ b/static/js/apps/datagemma/main.ts @@ -0,0 +1,31 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Entrypoint for datagemma page. + */ + +import React from "react"; +import ReactDOM from "react-dom"; + +import { App } from "./app"; + +window.addEventListener("load", (): void => { + ReactDOM.render( + React.createElement(App), + document.getElementById("datagemma") + ); +}); diff --git a/static/webpack.config.js b/static/webpack.config.js index 418df64774..64d4453d94 100644 --- a/static/webpack.config.js +++ b/static/webpack.config.js @@ -37,6 +37,7 @@ const config = { __dirname + "/css/tools/stat_var.scss", ], dev: [__dirname + "/js/dev.ts", __dirname + "/css/dev.scss"], + datagemma: [__dirname + "/js/apps/datagemma/main.ts"], diff: [__dirname + "/js/apps/diff/main.ts", __dirname + "/css/diff.scss"], timeline: [ __dirname + "/js/tools/timeline/timeline.ts", From 2f9b9f79ab2cd60dbf3174be429513895151fd24 Mon Sep 17 00:00:00 2001 From: kmoscoe <165203920+kmoscoe@users.noreply.github.com> Date: Thu, 6 Feb 2025 10:24:45 -0800 Subject: [PATCH 3/3] Add link to 2-pager in docsite (#4933) --- server/routes/redirects/redirects.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/routes/redirects/redirects.json 
b/server/routes/redirects/redirects.json index dcf5529d9d..cf48286cea 100644 --- a/server/routes/redirects/redirects.json +++ b/server/routes/redirects/redirects.json @@ -13,5 +13,6 @@ "dc-coverage-aa2": "/tools/visualization#visType%3Dmap%26place%3DEarth%26placeType%3DCountry%26sv%3D%7B%22dcid%22%3A%22Count_Variable_AdministrativeArea2%22%7D", "video": "https://www.youtube.com/watch?v=O6iVsS-RDYI", "form": "https://forms.gle/C816TLMyp5fBdDt67", - "DataGemmaPaper": "https://arxiv.org/abs/2409.13741" + "DataGemmaPaper": "https://arxiv.org/abs/2409.13741", + "2pager": "https://docs.datacommons.org/DataCommons-2pager.pdf" }