diff --git a/Dockerfile b/Dockerfile index f96aae2..66362df 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,6 +38,7 @@ RUN python3.10 download_models.py #serverless # CMD ["sh", "-c", "ls && python3.10 serverless.py"] +#download paddleocr model RUN sh download_model.sh CMD ["python3.10", "-m", "app.serverless"] diff --git a/app/serverless.py b/app/serverless.py index 51d16b0..fcf838b 100644 --- a/app/serverless.py +++ b/app/serverless.py @@ -60,6 +60,28 @@ def setup(): # Warm up the conversion process convert_to_markdown(sample_pdf_bytes, sample_dir, filename) + + # def get_model( + # self, + # ocr: bool, + # show_log: bool, + # lang=None, + # layout_model=None, + # formula_enable=None, + # table_enable=None, + # ): + #layout_model: doclayout_yolo, apply_formula: True, apply_ocr: True, apply_table: False, table_model: rapid_table, lang: None +def init_model(): + from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton + model_manager = ModelSingleton() + print("About to init model") + txt_model = model_manager.get_model(False, False) # noqa: F841 + print('txt_model init final') + ocr_model = model_manager.get_model(True, False) # noqa: F841 + print('ocr_model init final') + return 0 + + def handler(event): try: # Extract base64 encoded file and filename from the event @@ -87,4 +109,6 @@ def handler(event): # Call setup to initiate and warm up resources # setup() +init_model() + runpod.serverless.start({"handler": handler}) \ No newline at end of file diff --git a/download_model.sh b/download_model.sh index 6a48285..2044150 100755 --- a/download_model.sh +++ b/download_model.sh @@ -14,10 +14,10 @@ download_and_extract() { echo "Downloading ${model_name} from ${url}" | tee -a "$LOG_FILE" - # **Create the destination directory before downloading** + # Create the destination directory before downloading mkdir -p "$(dirname "${destination_path}")" - # **Download the file directly to destination_path without appending .tar** + # Download the file directly to destination_path without appending .tar if ! curl -L -o "${destination_path}" "${url}" 2>&1 | tee -a "$LOG_FILE"; then echo "Failed to download ${model_name} from ${url}" | tee -a "$LOG_FILE" exit 1 @@ -25,7 +25,7 @@ download_and_extract() { echo "Extracting ${model_name}" | tee -a "$LOG_FILE" - # **Extract the downloaded tar file** + # Extract the downloaded tar file if ! tar -xvf "${destination_path}" -C "$(dirname "${destination_path}")" 2>&1 | tee -a "$LOG_FILE"; then echo "Failed to extract ${model_name}" | tee -a "$LOG_FILE" exit 1 @@ -33,7 +33,7 @@ download_and_extract() { echo "Removing archive for ${model_name}" | tee -a "$LOG_FILE" - # **Remove the downloaded tar file** + # Remove the downloaded tar file if ! rm "${destination_path}" | tee -a "$LOG_FILE"; then echo "Failed to remove archive for ${model_name}" | tee -a "$LOG_FILE" exit 1 @@ -43,19 +43,16 @@ download_and_extract() { echo "----------------------------------------" | tee -a "$LOG_FILE" } -# Download PP Detect Model to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ +# Corrected download paths without double directories download_and_extract "https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar" \ - "/root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar" \ + "/root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar" \ "PP Detect Model" +download_and_extract "https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar" \ + "/root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar" \ + "PP Class Model" -# Download PP Rec Model to /root/.paddleocr/whl/rec/ch_ppocr_mobile_v2.0_rec_infer/ download_and_extract "https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar" \ - "/root/.paddleocr/whl/rec/ch_ppocr_mobile_v2.0_rec_infer/ch_ppocr_mobile_v2.0_rec_infer.tar" \ + "/root/.paddleocr/whl/rec/ch/ch_ppocr_mobile_v2.0_rec_infer.tar" \ "PP Rec Model" -# **New Addition**: Download PP Rec Model to /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/ -download_and_extract "https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar" \ - "/root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/ch_PP-OCRv4_rec_infer.tar" \ - "PP Rec Model Additional Path" - echo "All model downloads completed at $(date)" | tee -a "$LOG_FILE" \ No newline at end of file