def _get_api_key(env_keys=None, gcp_project='', gcp_path=''):
  """Gets an api key first from the environment, then from GCP secrets.

  Args:
    env_keys: Environment variable names to try, in order. The first one that
      is set to a non-empty value wins. May be a list/tuple of names, a single
      name as a string, or None (no environment lookup).
    gcp_project: The GCP project to use to get the api key from GCP secrets
    gcp_path: The secret id to read (its 'latest' version) from GCP secrets

  Returns:
    The API key if one is found, otherwise an empty string.

  TODO: use this method everywhere else in this file
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(env_keys, str):
    env_keys = [env_keys]

  # Try to get the key from the environment. Unset and empty variables are
  # both skipped so that the next candidate (or GCP secrets) is tried.
  for env_key in env_keys or ():
    value = os.environ.get(env_key)
    if value:
      return value

  # Try to get the key from secrets. Both the project and the secret path are
  # required; otherwise the secret manager client is never created.
  if gcp_project and gcp_path:
    secret_client = secretmanager.SecretManagerServiceClient()
    secret_name = secret_client.secret_version_path(gcp_project, gcp_path,
                                                    'latest')
    secret_response = secret_client.access_secret_version(name=secret_name)
    # Strip newlines that may have been stored along with the secret payload.
    return secret_response.payload.data.decode('UTF-8').replace('\n', '')

  # If key is not found, return an empty string
  return ''


def register_routes_datagemma(app, cfg):
  """Registers the DataGemma blueprints and loads the api keys they need.

  Args:
    app: The Flask app to register the blueprints on and to store keys in.
    cfg: The app config; its SECRET_PROJECT is used as the GCP project when a
      key is not available in the environment.
  """
  # Install blueprint for DataGemma page
  from server.routes.dev_datagemma import api as dev_datagemma_api
  app.register_blueprint(dev_datagemma_api.bp)
  from server.routes.dev_datagemma import html as dev_datagemma_html
  app.register_blueprint(dev_datagemma_html.bp)

  # Set the gemini api key
  app.config['GEMINI_API_KEY'] = _get_api_key(['GEMINI_API_KEY'],
                                              cfg.SECRET_PROJECT,
                                              'gemini-api-key')
  # Set the DC NL api key
  app.config['DC_NL_API_KEY'] = _get_api_key(['DC_NL_API_KEY'],
                                             cfg.SECRET_PROJECT,
                                             'dc-nl-api-key')
+ ENABLE_DATAGEMMA = False diff --git a/server/app_env/autopush.py b/server/app_env/autopush.py index ac25f61ca2..418a7cd370 100644 --- a/server/app_env/autopush.py +++ b/server/app_env/autopush.py @@ -25,3 +25,4 @@ class Config(_base.Config): HIDE_DEBUG = False USE_MEMCACHE = False ENABLE_BQ = True + ENABLE_DATAGEMMA = True diff --git a/server/app_env/local.py b/server/app_env/local.py index 0f2ca34e09..f1f0473814 100644 --- a/server/app_env/local.py +++ b/server/app_env/local.py @@ -21,6 +21,7 @@ class Config(_base.Config): SCHEME = 'http' USE_MEMCACHE = False ENABLE_BQ = True + ENABLE_DATAGEMMA = True class DCConfig(Config): diff --git a/server/requirements.txt b/server/requirements.txt index fbb9001536..aabab8b3c3 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -40,3 +40,5 @@ typing-extensions==4.10.0 webdriver-manager==4.0.0 Werkzeug==3.0.6 wheel==0.38.1 +# TODO: publish the library to pypi so we don't have to install from git +git+https://github.com/datacommonsorg/llm-tools.git diff --git a/server/routes/dev_datagemma/api.py b/server/routes/dev_datagemma/api.py new file mode 100644 index 0000000000..14856b98b7 --- /dev/null +++ b/server/routes/dev_datagemma/api.py @@ -0,0 +1,82 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Endpoints for DataGemma page"""

import json

from data_gemma import DataCommons
from data_gemma import GoogleAIStudio
from data_gemma import RAGFlow
from data_gemma import RIGFlow
from data_gemma import VertexAI
import flask
from flask import current_app
from flask import request
from flask import Response

# Define blueprint
bp = flask.Blueprint('dev_datagemma_api',
                     __name__,
                     url_prefix='/api/dev/datagemma')

# Accepted values of the `mode` query parameter.
_RIG_MODE = 'rig'
_RAG_MODE = 'rag'

# TODO: consider moving these specifications to a config somewhere
_VERTEX_AI_RIG = VertexAI(project_id='datcom-website-dev',
                          location='us-central1',
                          prediction_endpoint_id='4999251772590522368')
_VERTEX_AI_RAG = VertexAI(project_id='datcom-website-dev',
                          location='us-central1',
                          prediction_endpoint_id='3459865124959944704')


def _get_datagemma_result(query, mode):
  """Runs the DataGemma flow selected by `mode` on `query`.

  Args:
    query: Query to run datagemma flow on
    mode: mode to run the datagemma flow in (_RIG_MODE or _RAG_MODE)

  Returns:
    The value returned by the flow's query() call, or None when `mode` is
    neither of the known modes. Per the original authors this is a
    FlowResponse as defined here:
    https://github.com/datacommonsorg/llm-tools/blob/main/data_gemma/base.py#L116
  """
  # The data fetcher is shared by both flows and is built before dispatching.
  data_fetcher = DataCommons(api_key=current_app.config['DC_NL_API_KEY'])

  if mode == _RIG_MODE:
    rig_flow = RIGFlow(llm=_VERTEX_AI_RIG, data_fetcher=data_fetcher)
    return rig_flow.query(query=query)

  if mode == _RAG_MODE:
    answer_llm = GoogleAIStudio(
        model='gemini-1.5-pro', api_keys=[current_app.config['GEMINI_API_KEY']])
    rag_flow = RAGFlow(llm_question=_VERTEX_AI_RAG,
                       llm_answer=answer_llm,
                       data_fetcher=data_fetcher)
    return rag_flow.query(query=query)

  return None


@bp.route('/query')
def datagemma_query():
  """Handles /api/dev/datagemma/query.

  Reads the `query` and `mode` request args, responds with a 400 and a plain
  text message when either is missing or `mode` is not a known mode, and
  otherwise responds with a JSON object holding `answer` and `debug`.
  """
  query = request.args.get('query')
  mode = request.args.get('mode')

  if not query:
    return 'error: must provide a query field', 400
  # A missing or empty mode is never a member of the known modes.
  if mode not in (_RIG_MODE, _RAG_MODE):
    return f'error: must provide a mode field with values {_RIG_MODE} or {_RAG_MODE}', 400

  dg_result = _get_datagemma_result(query, mode)
  if dg_result:
    payload = {'answer': dg_result.answer(), 'debug': dg_result.debug()}
  else:
    # Falsy flow result: respond with empty fields rather than an error.
    payload = {'answer': '', 'debug': ''}
  return Response(json.dumps(payload), 200, mimetype='application/json')
"""DataGemma page routes"""

import flask

# Define blueprint
bp = flask.Blueprint("dev-datagemma", __name__, url_prefix='/dev/datagemma')


@bp.route('/')
def dev_datagemma():
  """Renders the 'dev/datagemma.html' template for the blueprint root."""
  template_name = 'dev/datagemma.html'
  return flask.render_template(template_name)
+ {% endblock %} + + {% block footer %} + + {% endblock %} + \ No newline at end of file diff --git a/static/js/apps/datagemma/app.tsx b/static/js/apps/datagemma/app.tsx new file mode 100644 index 0000000000..11bf03cdd0 --- /dev/null +++ b/static/js/apps/datagemma/app.tsx @@ -0,0 +1,310 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** @jsxImportSource @emotion/react */ + +/** + * Main component for the datagemma page + */ + +import { css, ThemeProvider } from "@emotion/react"; +import axios from "axios"; +import _ from "lodash"; +import queryString from "query-string"; +import React, { ReactElement, useEffect, useState } from "react"; +import Collapsible from "react-collapsible"; +import ReactMarkdown from "react-markdown"; +import { Input } from "reactstrap"; +import rehypeRaw from "rehype-raw"; +import remarkGfm from "remark-gfm"; + +import theme from "../../theme/theme"; +import { stringifyFn } from "../../utils/axios"; +import { updateHash } from "../../utils/url_utils"; +import { processTableText } from "../eval_retrieval_generation/util"; + +// Constants for query modes +const RIG_MODE = "rig"; +const RAG_MODE = "rag"; +// Constants for URL hash parameters +const URL_HASH_PARAMS = { + query: "q", + mode: "m", +}; + +// Interface for the response received from DataGemmaAPI. +interface DataGemmaAPIResponse { + answer: string; + debug: string; +} + +// Interface for the displayed answer. 
+interface DisplayedAnswer { + answer: string; + footnotes: string; + debugInfo: string; +} + +function getSectionTrigger(title: string, opened: boolean): JSX.Element { + return ( +