diff --git a/build/web_server/Dockerfile b/build/web_server/Dockerfile index 2224e0b9da..9eb1fa0b63 100644 --- a/build/web_server/Dockerfile +++ b/build/web_server/Dockerfile @@ -15,7 +15,7 @@ FROM nikolaik/python-nodejs:python3.11-nodejs18-slim as base -RUN apt-get update && apt-get -y upgrade +RUN apt-get update && apt-get -y upgrade && apt-get install -y git ARG ENV ENV ENV=${ENV} diff --git a/server/__init__.py b/server/__init__.py index 52517c2e74..e81713c155 100644 --- a/server/__init__.py +++ b/server/__init__.py @@ -45,6 +45,36 @@ DEFAULT_NL_ROOT = "http://127.0.0.1:6060" +def _get_api_key(env_keys=[], gcp_project='', gcp_path=''): + """Gets an api key first from the environment, then from GCP secrets. + + Args: + env_keys: A list of keys in the environment to try getting the api key with + gcp_project: The GCP project to use to get the api key from GCP secrets + gcp_path: The path to getting the api key from GCP secrets + + Returns: + API key if it exists + + TODO: use this method everywhere else in this file + """ + # Try to get the key from the environment + for k in env_keys: + if os.environ.get(k): + return os.environ.get(k) + + # Try to get the key from secrets + if gcp_project and gcp_path: + secret_client = secretmanager.SecretManagerServiceClient() + secret_name = secret_client.secret_version_path(gcp_project, gcp_path, + 'latest') + secret_response = secret_client.access_secret_version(name=secret_name) + return secret_response.payload.data.decode('UTF-8').replace('\n', '') + + # If key is not found, return an empty string + return '' + + def register_routes_base_dc(app): # apply the blueprints for all apps from server.routes.dev import html as dev_html @@ -132,6 +162,23 @@ def register_routes_sustainability(app): ) +def register_routes_datagemma(app, cfg): + # Install blueprint for DataGemma page + from server.routes.dev_datagemma import api as dev_datagemma_api + app.register_blueprint(dev_datagemma_api.bp) + from server.routes.dev_datagemma import html as dev_datagemma_html + app.register_blueprint(dev_datagemma_html.bp) + + # Set the gemini api key + app.config['GEMINI_API_KEY'] = _get_api_key(['GEMINI_API_KEY'], + cfg.SECRET_PROJECT, + 'gemini-api-key') + # Set the DC NL api key + app.config['DC_NL_API_KEY'] = _get_api_key(['DC_NL_API_KEY'], + cfg.SECRET_PROJECT, + 'dc-nl-api-key') + + def register_routes_common(app): # apply blueprints for main app from server.routes import static @@ -282,6 +329,9 @@ def create_app(nl_root=DEFAULT_NL_ROOT): if cfg.SHOW_SUSTAINABILITY: register_routes_sustainability(app) + if cfg.ENABLE_DATAGEMMA: + register_routes_datagemma(app, cfg) + # Load topic page config topic_page_configs = libutil.get_topic_page_config() app.config['TOPIC_PAGE_CONFIG'] = topic_page_configs diff --git a/server/app_env/_base.py b/server/app_env/_base.py index a3f3b18d69..56e6da6ee5 100644 --- a/server/app_env/_base.py +++ b/server/app_env/_base.py @@ -98,3 +98,6 @@ class Config: # Whether to enable BigQuery for instance. This is primarily used for # accessing the observation browser pages. ENABLE_BQ = False + # Whether to enable the DataGemma UI for this instance. This UI should only be + # enabled for internal instances. + ENABLE_DATAGEMMA = False diff --git a/server/app_env/autopush.py b/server/app_env/autopush.py index ac25f61ca2..418a7cd370 100644 --- a/server/app_env/autopush.py +++ b/server/app_env/autopush.py @@ -25,3 +25,4 @@ class Config(_base.Config): HIDE_DEBUG = False USE_MEMCACHE = False ENABLE_BQ = True + ENABLE_DATAGEMMA = True diff --git a/server/app_env/local.py b/server/app_env/local.py index 0f2ca34e09..f1f0473814 100644 --- a/server/app_env/local.py +++ b/server/app_env/local.py @@ -21,6 +21,7 @@ class Config(_base.Config): SCHEME = 'http' USE_MEMCACHE = False ENABLE_BQ = True + ENABLE_DATAGEMMA = True class DCConfig(Config): diff --git a/server/requirements.txt b/server/requirements.txt index fbb9001536..aabab8b3c3 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -40,3 +40,5 @@ typing-extensions==4.10.0 webdriver-manager==4.0.0 Werkzeug==3.0.6 wheel==0.38.1 +# TODO: publish the library to pypi so we don't have to install from git +git+https://github.com/datacommonsorg/llm-tools.git diff --git a/server/routes/dev_datagemma/api.py b/server/routes/dev_datagemma/api.py new file mode 100644 index 0000000000..14856b98b7 --- /dev/null +++ b/server/routes/dev_datagemma/api.py @@ -0,0 +1,82 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Endpoints for DataGemma page""" + +import json + +from data_gemma import DataCommons +from data_gemma import GoogleAIStudio +from data_gemma import RAGFlow +from data_gemma import RIGFlow +from data_gemma import VertexAI +import flask +from flask import current_app +from flask import request +from flask import Response + +# Define blueprint +bp = flask.Blueprint('dev_datagemma_api', + __name__, + url_prefix='/api/dev/datagemma') + +_RIG_MODE = 'rig' +_RAG_MODE = 'rag' + +# TODO: consider moving these specifications to a config somewhere +_VERTEX_AI_RIG = VertexAI(project_id='datcom-website-dev', + location='us-central1', + prediction_endpoint_id='4999251772590522368') +_VERTEX_AI_RAG = VertexAI(project_id='datcom-website-dev', + location='us-central1', + prediction_endpoint_id='3459865124959944704') + + +def _get_datagemma_result(query, mode): + """Gets the results of running a datagemma flow on a query + + Args: + query: Query to run datagemma flow on + mode: mode to run the datagemma flow in + + Returns: + Results of running the datagemma flow. This is a FlowResponse as defined + here: https://github.com/datacommonsorg/llm-tools/blob/main/data_gemma/base.py#L116 + """ + dc_nl_service = DataCommons(api_key=current_app.config['DC_NL_API_KEY']) + result = None + if mode == _RIG_MODE: + result = RIGFlow(llm=_VERTEX_AI_RIG, + data_fetcher=dc_nl_service).query(query=query) + elif mode == _RAG_MODE: + gemini_model = GoogleAIStudio( + model='gemini-1.5-pro', api_keys=[current_app.config['GEMINI_API_KEY']]) + result = RAGFlow(llm_question=_VERTEX_AI_RAG, + llm_answer=gemini_model, + data_fetcher=dc_nl_service).query(query=query) + return result + + +@bp.route('/query') +def datagemma_query(): + query = request.args.get('query') + mode = request.args.get('mode') + if not query: + return 'error: must provide a query field', 400 + if not mode or mode not in [_RIG_MODE, _RAG_MODE]: + return f'error: must provide a mode field with values {_RIG_MODE} or {_RAG_MODE}', 400 + dg_result = _get_datagemma_result(query, mode) + result = {'answer': '', 'debug': ''} + if dg_result: + result = {'answer': dg_result.answer(), 'debug': dg_result.debug()} + return Response(json.dumps(result), 200, mimetype='application/json') diff --git a/server/routes/dev_datagemma/html.py b/server/routes/dev_datagemma/html.py new file mode 100644 index 0000000000..236d76e88d --- /dev/null +++ b/server/routes/dev_datagemma/html.py @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DataGemma page routes""" + +import flask + +# Define blueprint +bp = flask.Blueprint("dev-datagemma", __name__, url_prefix='/dev/datagemma') + + +@bp.route('/') +def dev_datagemma(): + return flask.render_template('dev/datagemma.html') \ No newline at end of file diff --git a/server/routes/redirects/redirects.json b/server/routes/redirects/redirects.json index dcf5529d9d..cf48286cea 100644 --- a/server/routes/redirects/redirects.json +++ b/server/routes/redirects/redirects.json @@ -13,5 +13,6 @@ "dc-coverage-aa2": "/tools/visualization#visType%3Dmap%26place%3DEarth%26placeType%3DCountry%26sv%3D%7B%22dcid%22%3A%22Count_Variable_AdministrativeArea2%22%7D", "video": "https://www.youtube.com/watch?v=O6iVsS-RDYI", "form": "https://forms.gle/C816TLMyp5fBdDt67", - "DataGemmaPaper": "https://arxiv.org/abs/2409.13741" + "DataGemmaPaper": "https://arxiv.org/abs/2409.13741", + "2pager": "https://docs.datacommons.org/DataCommons-2pager.pdf" } diff --git a/server/templates/dev/datagemma.html b/server/templates/dev/datagemma.html new file mode 100644 index 0000000000..293fd358ba --- /dev/null +++ b/server/templates/dev/datagemma.html @@ -0,0 +1,34 @@ +{# + Copyright 2025 Google LLC + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + #} + {%- extends BASE_HTML -%} + + {% set main_id = 'dev-datagemma' %} + {% set page_id = 'page-dev-datagemma' %} + {% set title = 'Datagemma' %} + {% set is_hide_header_search_bar = 'true' %} + + {% block head %} + + {% endblock %} + + {% block content %} +
+ {% endblock %} + + {% block footer %} + + {% endblock %} + \ No newline at end of file diff --git a/static/js/apps/datagemma/app.tsx b/static/js/apps/datagemma/app.tsx new file mode 100644 index 0000000000..11bf03cdd0 --- /dev/null +++ b/static/js/apps/datagemma/app.tsx @@ -0,0 +1,310 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** @jsxImportSource @emotion/react */ + +/** + * Main component for the datagemma page + */ + +import { css, ThemeProvider } from "@emotion/react"; +import axios from "axios"; +import _ from "lodash"; +import queryString from "query-string"; +import React, { ReactElement, useEffect, useState } from "react"; +import Collapsible from "react-collapsible"; +import ReactMarkdown from "react-markdown"; +import { Input } from "reactstrap"; +import rehypeRaw from "rehype-raw"; +import remarkGfm from "remark-gfm"; + +import theme from "../../theme/theme"; +import { stringifyFn } from "../../utils/axios"; +import { updateHash } from "../../utils/url_utils"; +import { processTableText } from "../eval_retrieval_generation/util"; + +// Constants for query modes +const RIG_MODE = "rig"; +const RAG_MODE = "rag"; +// Constants for URL hash parameters +const URL_HASH_PARAMS = { + query: "q", + mode: "m", +}; + +// Interface for the response received from DataGemmaAPI. +interface DataGemmaAPIResponse { + answer: string; + debug: string; +} + +// Interface for the displayed answer. +interface DisplayedAnswer { + answer: string; + footnotes: string; + debugInfo: string; +} + +function getSectionTrigger(title: string, opened: boolean): JSX.Element { + return ( +
+ + {opened ? "arrow_drop_down" : "arrow_right"} + + {title} +
+ ); +} + +/** + * Helper function to process a RIG query response into a displayable answer. + * Splits the response into the answer and footnotes sections. + * @param response The raw response string from the RIG query. + * @returns A DisplayedAnswer object containing the parsed answer and footnotes. + */ +function processRigResponse(response: DataGemmaAPIResponse): DisplayedAnswer { + const answer_parts = response.answer.split("#### FOOTNOTES ####\n"); + let footnotes = ""; + if (answer_parts.length > 1) { + footnotes = answer_parts[1]; + footnotes = footnotes.replaceAll("\n", "\n\n"); + } + return { answer: answer_parts[0], footnotes, debugInfo: response.debug }; +} + +/** + * Helper function to process a RAG query response into a displayable answer. + * Splits the response into the answer and footnotes sections. + * @param response The raw response string from the RAG query. + * @returns A DisplayedAnswer object containing the parsed answer and footnotes. + */ +function processRagResponse(response: DataGemmaAPIResponse): DisplayedAnswer { + const answer_parts = response.answer.split("#### TABLES ####\n"); + let footnotes = ""; + if (answer_parts.length > 1) { + const footnotes_part = answer_parts[1]; + const table_list = footnotes_part.split("Table"); + for (const t of table_list) { + const trimmed_t = t.trim(); + if (!trimmed_t) { + continue; + } + + const title = _.cloneDeep(trimmed_t).split("\n", 1)[0]; + const tableContent = trimmed_t.replace(title, ""); + const processedTable = processTableText(tableContent.trim()); + footnotes += "Table " + title + "\n" + processedTable + "\n\n"; + } + } + return { answer: answer_parts[0], footnotes, debugInfo: response.debug }; +} + +/** + * Application container + */ +export function App(): ReactElement { + const [query, setQuery] = useState(""); + const [answer, setAnswer] = useState(null); + const [mode, setMode] = useState(RIG_MODE); + const [showLoading, setShowLoading] = useState(false); + + /** + * useEffect hook to handle initial loading of information from the URL hash. + */ + useEffect(() => { + const hashParams = queryString.parse(window.location.hash); + const hashQuery = (hashParams[URL_HASH_PARAMS.query] || "") as string; + const hashMode = (hashParams[URL_HASH_PARAMS.mode] || "") as string; + if (hashQuery) { + setQuery(hashQuery); + } + if (hashMode) { + setMode(hashMode); + } + }, []); + + /** + * Function to execute the query when the "Run" button is clicked. + */ + function onQueryRun(): void { + setShowLoading(true); + updateHash({ [URL_HASH_PARAMS.query]: query }); + axios + .get("/api/dev/datagemma/query", { + params: { query, mode }, + paramsSerializer: stringifyFn, + }) + .then((resp) => { + if (mode === RIG_MODE) { + setAnswer(processRigResponse(resp.data)); + } else if (mode === RAG_MODE) { + setAnswer(processRagResponse(resp.data)); + } + }) + .catch(() => { + setAnswer({ + answer: "There was a problem running the query, please try again.", + footnotes: "", + debugInfo: "", + }); + }) + .finally(() => { + setShowLoading(false); + }); + } + + return ( + +
+
+ DataGemma Playground +
+
+
+ + setQuery(e.target.value)} + /> +
+
+ {[RIG_MODE, RAG_MODE].map((m) => { + return ( +
+ { + setMode(m); + updateHash({ [URL_HASH_PARAMS.mode]: m }); + }} + /> + +
+ ); + })} +
+
+ Run +
+
+
table, + th, + td { + border: solid; + } + `} + > + {showLoading &&
Loading ...
} + {!showLoading && answer && ( +
+
+ Answer +
+ + {answer.answer} + + {answer.footnotes && ( + + + {answer.footnotes} + + + )} + {answer.debugInfo && ( + + + {answer.debugInfo} + + + )} +
+ )} +
+
+
+ ); +} diff --git a/static/js/apps/datagemma/main.ts b/static/js/apps/datagemma/main.ts new file mode 100644 index 0000000000..28b92b406b --- /dev/null +++ b/static/js/apps/datagemma/main.ts @@ -0,0 +1,31 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Entrypoint for datagemma page. + */ + +import React from "react"; +import ReactDOM from "react-dom"; + +import { App } from "./app"; + +window.addEventListener("load", (): void => { + ReactDOM.render( + React.createElement(App), + document.getElementById("datagemma") + ); +}); diff --git a/static/js/components/nl_search_bar/auto_complete_input.tsx b/static/js/components/nl_search_bar/auto_complete_input.tsx index 92c030d36a..ae08590094 100644 --- a/static/js/components/nl_search_bar/auto_complete_input.tsx +++ b/static/js/components/nl_search_bar/auto_complete_input.tsx @@ -147,9 +147,16 @@ export function AutoCompleteInput( useEffect(() => { // TriggerSearch state used to ensure onSearch only called after text updated. - props.onSearch(); + executeQuery(); }, [triggerSearch, setTriggerSearch]); + function executeQuery(): void { + setResults({ placeResults: [], svResults: [] }); + setHoveredIdx(-1); + controller.current.abort(); // Ensure autocomplete responses can't come back. + props.onSearch(); + } + function onInputChange(e: React.ChangeEvent): void { const currentText = e.target.value; changeText(currentText); @@ -252,7 +259,7 @@ export function AutoCompleteInput( if (hoveredIdx >= 0) { selectResult(results.placeResults[hoveredIdx], hoveredIdx); } else { - props.onSearch(); + executeQuery(); } break; case "ArrowUp": @@ -343,7 +350,7 @@ export function AutoCompleteInput( autoComplete="one-time-code" autoFocus={props.shouldAutoFocus} > -
+
{isHeaderBar && }
diff --git a/static/webpack.config.js b/static/webpack.config.js index 418df64774..64d4453d94 100644 --- a/static/webpack.config.js +++ b/static/webpack.config.js @@ -37,6 +37,7 @@ const config = { __dirname + "/css/tools/stat_var.scss", ], dev: [__dirname + "/js/dev.ts", __dirname + "/css/dev.scss"], + datagemma: [__dirname + "/js/apps/datagemma/main.ts"], diff: [__dirname + "/js/apps/diff/main.ts", __dirname + "/css/diff.scss"], timeline: [ __dirname + "/js/tools/timeline/timeline.ts",