diff --git a/apps/application/serializers/application_serializers.py b/apps/application/serializers/application_serializers.py index b6479657a2b..a6f2ea29a39 100644 --- a/apps/application/serializers/application_serializers.py +++ b/apps/application/serializers/application_serializers.py @@ -209,15 +209,16 @@ def auth(self, request, with_valid=True): access_token = self.data.get("access_token") application_access_token = QuerySet(ApplicationAccessToken).filter(access_token=access_token).first() if application_access_token is not None and application_access_token.is_active: - if token is None or (token_details is not None and 'client_id' not in token_details) or ( - token_details is not None and token_details.get( - 'access_token') != application_access_token.access_token): + if token_details is not None and 'client_id' in token_details and token_details.get( + 'client_id') is not None: + client_id = token_details.get('client_id') + else: client_id = str(uuid.uuid1()) - token = signing.dumps({'application_id': str(application_access_token.application_id), - 'user_id': str(application_access_token.application.user.id), - 'access_token': application_access_token.access_token, - 'type': AuthenticationType.APPLICATION_ACCESS_TOKEN.value, - 'client_id': client_id}) + token = signing.dumps({'application_id': str(application_access_token.application_id), + 'user_id': str(application_access_token.application.user.id), + 'access_token': application_access_token.access_token, + 'type': AuthenticationType.APPLICATION_ACCESS_TOKEN.value, + 'client_id': client_id}) return token else: raise NotFound404(404, "无效的access_token") diff --git a/apps/application/sql/export_application_chat.sql b/apps/application/sql/export_application_chat.sql new file mode 100644 index 00000000000..dc580847e21 --- /dev/null +++ b/apps/application/sql/export_application_chat.sql @@ -0,0 +1,37 @@ +SELECT + application_chat."id" as chat_id, + application_chat.abstract as abstract, + application_chat_record_temp.problem_text as problem_text, + application_chat_record_temp.answer_text as answer_text, + application_chat_record_temp.message_tokens as message_tokens, + application_chat_record_temp.answer_tokens as answer_tokens, + application_chat_record_temp.run_time as run_time, + application_chat_record_temp.details::JSON as details, + application_chat_record_temp."index" as "index", + application_chat_record_temp.improve_paragraph_list as improve_paragraph_list, + application_chat_record_temp.vote_status as vote_status, + application_chat_record_temp.create_time as create_time +FROM + application_chat application_chat + LEFT JOIN ( + SELECT COUNT + ( "id" ) AS chat_record_count, + SUM ( CASE WHEN "vote_status" = '0' THEN 1 ELSE 0 END ) AS star_num, + SUM ( CASE WHEN "vote_status" = '1' THEN 1 ELSE 0 END ) AS trample_num, + SUM ( CASE WHEN array_length( application_chat_record.improve_paragraph_id_list, 1 ) IS NULL THEN 0 ELSE array_length( application_chat_record.improve_paragraph_id_list, 1 ) END ) AS mark_sum, + chat_id + FROM + application_chat_record + GROUP BY + application_chat_record.chat_id + ) chat_record_temp ON application_chat."id" = chat_record_temp.chat_id + LEFT JOIN ( + SELECT + *, + CASE + WHEN array_length( application_chat_record.improve_paragraph_id_list, 1 ) IS NULL THEN + '{}' ELSE ( SELECT ARRAY_AGG ( row_to_json ( paragraph ) ) FROM paragraph WHERE "id" = ANY ( application_chat_record.improve_paragraph_id_list ) ) + END as improve_paragraph_list + FROM + application_chat_record application_chat_record + ) application_chat_record_temp ON application_chat_record_temp.chat_id = application_chat."id" \ No newline at end of file diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py index 67f56c37d30..176c1d6090c 100644 --- a/apps/common/handle/impl/text_split_handle.py +++ b/apps/common/handle/impl/text_split_handle.py @@ -9,7 +9,7 @@ import re from typing import List -import chardet +from charset_normalizer import detect from common.handle.base_split_handle import BaseSplitHandle from common.util.split_model import SplitModel @@ -26,7 +26,7 @@ def support(self, file, get_buffer): file_name: str = file.name.lower() if file_name.endswith(".md") or file_name.endswith('.txt'): return True - result = chardet.detect(buffer) + result = detect(buffer) if result['encoding'] != 'ascii' and result['confidence'] > 0.5: return True return False @@ -38,7 +38,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu else: split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) try: - content = buffer.decode(chardet.detect(buffer)['encoding']) + content = buffer.decode(detect(buffer)['encoding']) except BaseException as e: return {'name': file.name, 'content': []} diff --git a/apps/common/util/fork.py b/apps/common/util/fork.py index eba10bbb184..7106bab96ec 100644 --- a/apps/common/util/fork.py +++ b/apps/common/util/fork.py @@ -4,9 +4,8 @@ import traceback from functools import reduce from typing import List, Set -from urllib.parse import urljoin, urlparse, ParseResult, urlsplit +from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse -import chardet import html2text as ht import requests from bs4 import BeautifulSoup @@ -44,6 +43,13 @@ def fork_child(child_link: ChildLink, selector_list: List[str], level: int, excl ForkManage.fork_child(child_link, selector_list, level - 1, exclude_link_url, fork_handler) +def remove_fragment(url: str) -> str: + parsed_url = urlparse(url) + modified_url = ParseResult(scheme=parsed_url.scheme, netloc=parsed_url.netloc, path=parsed_url.path, + params=parsed_url.params, query=parsed_url.query, fragment=None) + return urlunparse(modified_url) + + class Fork: class Response: def __init__(self, content: str, child_link_list: List[ChildLink], status, message: str): @@ -61,6 +67,7 @@ def error(message: str): return Fork.Response('', [], 500, message) def __init__(self, base_fork_url: str, selector_list: List[str]): + base_fork_url = remove_fragment(base_fork_url) self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.') parsed = urlsplit(base_fork_url) query = parsed.query @@ -74,9 +81,11 @@ def __init__(self, base_fork_url: str, selector_list: List[str]): fragment='').geturl() def get_child_link_list(self, bf: BeautifulSoup): - pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + ").*" + pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + "|/).*" link_list = bf.find_all(name='a', href=re.compile(pattern)) - result = [ChildLink(link.get('href'), link) for link in link_list] + result = [ChildLink(link.get('href'), link) if link.get('href').startswith(self.base_url) else ChildLink( + self.base_url + link.get('href'), link) for link in link_list] + result = [row for row in result if row.url.startswith(self.base_fork_url)] return result def get_content_html(self, bf: BeautifulSoup): @@ -122,9 +131,18 @@ def reset_beautiful_soup(self, bf: BeautifulSoup): @staticmethod def get_beautiful_soup(response): - encoding = response.encoding if response.encoding and response.encoding != 'ISO-8859-1' is not None else response.apparent_encoding + encoding = response.encoding if response.encoding is not None and response.encoding != 'ISO-8859-1' else response.apparent_encoding html_content = response.content.decode(encoding) - return BeautifulSoup(html_content, "html.parser") + beautiful_soup = BeautifulSoup(html_content, "html.parser") + meta_list = beautiful_soup.find_all('meta') + charset_list = [meta.attrs.get('charset') for meta in meta_list if + meta.attrs is not None and 'charset' in meta.attrs] + if len(charset_list) > 0: + charset = charset_list[0] + if charset != encoding: + html_content = response.content.decode(charset) + return BeautifulSoup(html_content, "html.parser") + return beautiful_soup def fork(self): try: diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py index 8e062d8b4d7..2e469dfe9d7 100644 --- a/apps/dataset/serializers/document_serializers.py +++ b/apps/dataset/serializers/document_serializers.py @@ -30,12 +30,11 @@ from common.util.field_message import ErrMessage from common.util.file_util import get_file_content from common.util.fork import Fork -from common.util.split_model import SplitModel, get_split_model +from common.util.split_model import get_split_model from dataset.models.data_set import DataSet, Document, Paragraph, Problem, Type, Status, ProblemParagraphMapping from dataset.serializers.common_serializers import BatchSerializer, MetaSerializer from dataset.serializers.paragraph_serializers import ParagraphSerializers, ParagraphInstanceSerializer from smartdoc.conf import PROJECT_DIR -import chardet class DocumentEditInstanceSerializer(ApiMixin, serializers.Serializer): diff --git a/pyproject.toml b/pyproject.toml index 3ca6513ec83..cbe0e5dc2a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ jieba = "^0.42.1" diskcache = "^5.6.3" pillow = "^10.2.0" filetype = "^1.2.0" -chardet = "^5.2.0" torch = "^2.2.1" sentence-transformers = "^2.2.2" blinker = "^1.6.3" @@ -30,7 +29,6 @@ html2text = "^2024.2.26" langchain-openai = "^0.0.8" django-ipware = "^6.0.4" django-apscheduler = "^0.6.2" -chardet2 = "^2.0.3" pymupdf = "^1.24.0" python-docx = "^1.1.0" xlwt = "^1.3.0" diff --git a/ui/src/api/log.ts b/ui/src/api/log.ts index c610af2c45d..5135ccb853d 100644 --- a/ui/src/api/log.ts +++ b/ui/src/api/log.ts @@ -38,6 +38,15 @@ const exportChatLog: ( exportExcel(applicantion_name, `${prefix}/${applicaiton_id}/chat/export`, param, loading) } +const exportChatLog: ( + applicaiton_id: string, + applicantion_name: string, + param: any, + loading?: Ref +) => void = (applicaiton_id, applicantion_name, param, loading) => { + exportExcel(applicantion_name, `${prefix}/${applicaiton_id}/chat/export`, param, loading) +} + /** * 删除日志 * @param 参数 applicaiton_id, chat_id, diff --git a/ui/src/components/app-charts/components/LineCharts.vue b/ui/src/components/app-charts/components/LineCharts.vue index 5c236e2cbb7..1f37db54e90 100644 --- a/ui/src/components/app-charts/components/LineCharts.vue +++ b/ui/src/components/app-charts/components/LineCharts.vue @@ -4,6 +4,7 @@