Skip to content

Commit

Permalink
add xls file support (#3321)
Browse files Browse the repository at this point in the history
  • Loading branch information
ic-xu authored Apr 12, 2024
1 parent 42936fc commit ad65c89
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 9 deletions.
9 changes: 7 additions & 2 deletions api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@

# Apply gevent monkey patching unless the DEBUG environment variable is set
# to 'true' (compared case-insensitively).
# NOTE(review): this sits ahead of the other imports, presumably so that they
# pick up the patched modules - confirm before reordering.
if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true':
    from gevent import monkey

    monkey.patch_all()
    # if os.environ.get("VECTOR_STORE") == 'milvus':
    import grpc.experimental.gevent

    # Initialise gRPC's experimental gevent integration together with the
    # monkey patching above.
    grpc.experimental.gevent.init_gevent()

import langchain

# Enable langchain's verbose flag for the whole process.
langchain.verbose = True

import json
Expand Down Expand Up @@ -44,14 +47,15 @@
# DO NOT REMOVE BELOW
from events import event_handlers
from models import account, dataset, model, source, task, tool, tools, web

# DO NOT REMOVE ABOVE


# Suppress ResourceWarning for the whole process.
warnings.simplefilter("ignore", ResourceWarning)

# Run everything in UTC.
if os.name == "nt":
    # time.tzset() is not available on Windows, so shell out to tzutil
    # instead. One invocation is enough; the call used to be issued twice.
    # NOTE(review): tzutil changes the system-wide timezone, not only this
    # process - confirm this is acceptable on Windows hosts.
    os.system('tzutil /s "UTC"')
else:
    os.environ['TZ'] = 'UTC'
    time.tzset()
Expand All @@ -60,13 +64,15 @@
class DifyApp(Flask):
    """Application-specific Flask subclass; it currently adds nothing to Flask."""
    pass


# -------------
# Configuration
# -------------


config_type = os.getenv('EDITION', default='SELF_HOSTED') # ce edition first


# ----------------------------
# Application Factory Function
# ----------------------------
Expand Down Expand Up @@ -192,7 +198,6 @@ def register_blueprints(app):
# Build the application object at import time.
app = create_app()
# The Celery object is looked up in the app's extension registry under "celery".
celery = app.extensions["celery"]

# Print a notice on stdout when the TESTING config flag is set.
if app.config['TESTING']:
    print("App is running in TESTING mode")

Expand Down
47 changes: 44 additions & 3 deletions api/core/rag/extractor/excel_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional

import pandas as pd
import xlrd

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
Expand All @@ -27,10 +28,37 @@ def __init__(
self._autodetect_encoding = autodetect_encoding

def extract(self) -> list[Document]:
    """
    Parse the Excel file and return the extracted documents.

    The file extension selects the reader: ``.xls`` files go to
    ``_extract4xls`` and ``.xlsx`` files go to ``_extract4xlsx``. The
    extension is matched case-insensitively, so a path such as
    ``REPORT.XLS`` is handled as well.

    :return: the documents produced by the matching reader.
    :raises ValueError: if the path ends in neither ``.xls`` nor ``.xlsx``.
    """
    lowered_path = self._file_path.lower()
    if lowered_path.endswith('.xls'):
        return self._extract4xls()
    if lowered_path.endswith('.xlsx'):
        return self._extract4xlsx()
    # Falling through used to return None implicitly, which contradicts the
    # declared return type and only blows up later in the caller.
    raise ValueError(f"Unsupported Excel file extension: {self._file_path}")

def _extract4xls(self) -> list[Document]:
    """
    Read a legacy ``.xls`` workbook with xlrd.

    In each sheet the first non-blank row is used as the header row. Every
    later non-blank row becomes one document whose content is a
    newline-separated list of ``header:value`` pairs.

    :return: one document per non-blank data row, across all sheets.
    """
    wb = xlrd.open_workbook(filename=self._file_path)
    documents = []
    # loop over all sheets
    for sheet in wb.sheets():
        # The header must be reset once per sheet. Resetting it inside the
        # row loop made every row look like a header row, so no document
        # was ever produced.
        row_header = None
        for row in sheet.get_rows():
            if self.is_blank_row(row):
                continue
            if row_header is None:
                row_header = row
                continue
            item_arr = []
            for index, cell in enumerate(row):
                txt_value = str(cell.value)
                item_arr.append(f'{row_header[index].value}:{txt_value}')
            item_str = "\n".join(item_arr)
            document = Document(page_content=item_str, metadata={'source': self._file_path})
            documents.append(document)
    return documents

def _extract4xlsx(self) -> list[Document]:
"""Load from file path using Pandas."""
data = []

# 使用 Pandas 读取 Excel 文件的每个工作表
# Read each worksheet of an Excel file using Pandas
xls = pd.ExcelFile(self._file_path)
for sheet_name in xls.sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name)
Expand All @@ -43,5 +71,18 @@ def extract(self) -> list[Document]:
item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
document = Document(page_content=item, metadata={'source': self._file_path})
data.append(document)

return data

@staticmethod
def is_blank_row(row):
"""
Determine whether the specified line is a blank line.
:param row: row object。
:return: Returns True if the row is blank, False otherwise.
"""
# Iterates through the cells and returns False if a non-empty cell is found
for cell in row:
if cell.value is not None and cell.value != '':
return False
return True
4 changes: 2 additions & 2 deletions api/core/rag/extractor/extract_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def extract(cls, extract_setting: ExtractSetting, is_automatic: bool = False,
etl_type = current_app.config['ETL_TYPE']
unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
if etl_type == 'Unstructured':
if file_extension == '.xlsx':
if file_extension == '.xlsx' or file_extension == '.xls':
extractor = ExcelExtractor(file_path)
elif file_extension == '.pdf':
extractor = PdfExtractor(file_path)
Expand Down Expand Up @@ -114,7 +114,7 @@ def extract(cls, extract_setting: ExtractSetting, is_automatic: bool = False,
extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
else TextExtractor(file_path, autodetect_encoding=True)
else:
if file_extension == '.xlsx':
if file_extension == '.xlsx' or file_extension == '.xls':
extractor = ExcelExtractor(file_path)
elif file_extension == '.pdf':
extractor = PdfExtractor(file_path)
Expand Down
1 change: 1 addition & 0 deletions api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,4 @@ qrcode~=7.4.2
azure-storage-blob==12.9.0
azure-identity==1.15.0
lxml==5.1.0
xlrd~=2.0.1
5 changes: 3 additions & 2 deletions api/services/file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])

ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', 'docx', 'csv']
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls',
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']

PREVIEW_WORDS_LIMIT = 3000


Expand Down

0 comments on commit ad65c89

Please sign in to comment.