Skip to content

Commit

Permalink
feat: Retrieve chunkSize and worker from the ConfigMap.
Browse files Browse the repository at this point in the history
  • Loading branch information
wangxinbiao committed Mar 14, 2024
1 parent 6ca975b commit 0ce9682
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 6 deletions.
9 changes: 7 additions & 2 deletions pypi/data-processing/src/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,16 @@ def __set_property_value(self):

self.llm_qa_retry_count = int(llm_qa_retry_count)

# knowledge
# dataprocess
dataprocess = model_cr.get_dataprocess_in_k8s_configmap(
namespace=k8s_pod_namespace, config_map_name=k8s_default_config
)
# chunk size
self.knowledge_chunk_size = 500
self.knowledge_chunk_size = dataprocess.get("chunkSize", 500)
# chunk overlap
self.knowledge_chunk_overlap = 50
# worker
self.worker = dataprocess.get("worker", 1)

# backend PostgreSQL
postgresql_config = postgresql_cr.get_postgresql_config_in_k8s_configmap(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,15 @@ def __init__(
self,
separator: str = "\n\n",
pipeline: str = "zh_core_web_sm",
chunk_size: int = 500,
chunk_overlap: int = 10,
chunk_size: int = None,
chunk_overlap: int = None,
):
"""Initialize the spacy text splitter."""
if chunk_size is None:
chunk_size = config.knowledge_chunk_size
if chunk_overlap is None:
chunk_overlap = config.knowledge_chunk_overlap

if chunk_overlap > chunk_size:
raise ValueError(
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
Expand Down
32 changes: 31 additions & 1 deletion pypi/data-processing/src/kube/model_cr.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,34 @@ def get_spec_for_embedding_k8s_cr(name, namespace):
return {"status": 200, "message": "获取embedding中的provider成功", "data": provider}
except Exception as ex:
logger.error(str(ex))
return {"status": 400, "message": "获取embedding中的provider失败", "data": ""}
return {"status": 400, "message": "获取embedding中的provider失败", "data": ""}

def get_dataprocess_in_k8s_configmap(namespace, config_map_name):
    """Get the dataprocess settings stored in a Kubernetes ConfigMap.

    Reads the ConfigMap, takes its "dataprocess" entry and parses it as YAML.

    Args:
        namespace: namespace the ConfigMap lives in.
        config_map_name: name of the ConfigMap.

    Returns:
        The parsed "dataprocess" mapping. An empty dict is returned when the
        ConfigMap cannot be read, when the "dataprocess" entry is missing or
        empty, or when it does not parse to a mapping, so that callers can
        always use ``.get(key, default)`` on the result.
    """
    try:
        kube = client.KubeEnv()

        config_map = kube.read_namespaced_config_map(
            namespace=namespace, name=config_map_name
        )

        raw_dataprocess = config_map.data.get("dataprocess")
        if not raw_dataprocess:
            # Missing or empty entry: let callers fall back to their defaults.
            logger.error(
                "Can not get the dataprocess. "
                "The 'dataprocess' entry is missing or empty in the configmap."
            )
            return {}

        dataprocess = yaml.safe_load(raw_dataprocess)
    except Exception:
        # Best-effort read: log the full traceback and fall back to defaults
        # rather than failing the caller.
        logger.error(
            "".join(
                [
                    "Can not get the dataprocess. The error is: \n",
                    f"{traceback.format_exc()}\n",
                ]
            )
        )
        return {}

    if not isinstance(dataprocess, dict):
        # A scalar or list cannot be queried with .get(); treat it as unset.
        logger.error(
            "Can not get the dataprocess. "
            "The 'dataprocess' entry in the configmap is not a mapping."
        )
        return {}

    return dataprocess
2 changes: 1 addition & 1 deletion pypi/data-processing/src/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,4 @@ def _create_database_connection():


if __name__ == "__main__":
sanic_app.run(host="0.0.0.0", port=28888, access_log=False, debug=False, workers=2)
sanic_app.run(host="0.0.0.0", port=28888, access_log=False, debug=False, workers=config.worker)

0 comments on commit 0ce9682

Please sign in to comment.