Skip to content

Commit

Permalink
Merge pull request #117 from marieai/develop
Browse files Browse the repository at this point in the history
Merging after history rewrite on develop
  • Loading branch information
gregbugaj authored Jul 2, 2024
2 parents c5c3947 + 2aea97c commit c98b063
Show file tree
Hide file tree
Showing 111 changed files with 6,579 additions and 2,156 deletions.
2 changes: 1 addition & 1 deletion .run/batch_document_ocr.run.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/examples/batch_document_ocr.py" />
<option name="PARAMETERS" value="--config config.dev.json --pipeline default --input ~/tmp/analysis/marie-issues/108/195588965/195588965-0001.png --output_dir ~/tmp/analysis/marie-issues/108/195588965" />
<option name="PARAMETERS" value="--config config.dev.json --pipeline default --input $USER_HOME$/tmp/analysis/oda-research/204019943/PID_900_6705_0_204019943.tif --output_dir $USER_HOME$/tmp/analysis/oda-research/204019943/300dpi" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="marie gateway" type="PythonConfigurationType" factoryName="Python">
<configuration default="false" name="marie gateway - params" type="PythonConfigurationType" factoryName="Python">
<module name="marie-ai" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
Expand All @@ -13,7 +14,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/marie/__main__.py" />
<option name="PARAMETERS" value="gateway --protocol GRPC --host 192.168.102.53 --port 52000 --discovery --discovery-host 127.0.0.1 --discovery-port 8500" />
<option name="PARAMETERS" value="gateway --protocols GRPC HTTP --ports 52000 51000 --host 0.0.0.0" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
Expand Down
25 changes: 25 additions & 0 deletions .run/marie gateway-FILE.run.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<!--
  Shared JetBrains run configuration "marie gateway-FILE".
  Launches marie/__main__.py with the "gateway" sub-command and hands it a
  gateway YAML file through the "uses" argument.
  NOTE(review): the YAML path in PARAMETERS is absolute and machine-specific;
  confirm it exists on every workstation that shares this configuration.
-->
<component name="ProjectRunConfigurationManager">
  <configuration default="false" name="marie gateway-FILE" type="PythonConfigurationType" factoryName="Python">
    <module name="marie-ai" />
    <option name="ENV_FILES" value="" />
    <option name="INTERPRETER_OPTIONS" value="" />
    <option name="PARENT_ENVS" value="true" />
    <envs>
      <env name="PYTHONUNBUFFERED" value="1" />
    </envs>
    <option name="SDK_HOME" value="" />
    <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/" />
    <option name="IS_MODULE_SDK" value="true" />
    <option name="ADD_CONTENT_ROOTS" value="true" />
    <option name="ADD_SOURCE_ROOTS" value="true" />
    <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
    <option name="SCRIPT_NAME" value="$PROJECT_DIR$/marie/__main__.py" />
    <option name="PARAMETERS" value="gateway --uses /mnt/data/marie-ai/config/service/gateway.yml" />
    <option name="SHOW_COMMAND_LINE" value="false" />
    <option name="EMULATE_TERMINAL" value="false" />
    <option name="MODULE_MODE" value="false" />
    <option name="REDIRECT_INPUT" value="false" />
    <option name="INPUT_FILE" value="" />
    <method v="2" />
  </configuration>
</component>
21 changes: 20 additions & 1 deletion README-GB.md
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,9 @@ https://github.com/fioresxcat/VAT_245/tree/fa526ac7e2ce9bb392ca66bd86305d69caee7
# Table Transformer and Table Detection
https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Using_Table_Transformer_for_table_detection_and_table_structure_recognition.ipynb

# IDEAS
dedoc
https://github.com/ispras/dedoc/blob/master/dedoc/structure_constructors/abstract_structure_constructor.py


https://cloud.google.com/document-ai
Expand Down Expand Up @@ -505,4 +508,20 @@ event.json
act.secrets
```
MARIE_CORE_RELEASE_TOKEN=ghp_ABC
```
```

## Pydantic
```bash
pydantic 1.10.15
pydantic_core 2.10.1
```





# Rewriting history

```bash
git filter-repo --mailmap mailmap --force
```
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ aws s3 cp some_file.txt s3://mybucket --profile marie --endpoint-url http://loc
aws s3 --profile marie --endpoint-url=http://127.0.0.1:8000 ls --recursive s3://
```

Remove all files from the `marie` bucket (recursive delete):
```shell
aws s3 rm s3://marie --recursive --profile marie --endpoint-url http://localhost:8000
```



# Production setup


Expand Down
278 changes: 278 additions & 0 deletions config/service/bones.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
# Top-level object type and version of this service definition.
jtype: Flow
version: '1'
# NOTE(review): the `with` section further down lists both http and grpc
# protocols — confirm which of the two settings takes effect.
protocol: grpc

# Shared configuration
shared_config:
  # Reusable connection profiles. Each profile carries an anchor so that other
  # sections of this file can pull it in with a `<<: *anchor` merge key and
  # override individual keys.
  storage: &storage
    # PostgreSQL profile merged by the psql sections elsewhere in this file.
    # NOTE(review): literal credentials are stored here — confirm they are
    # development-only values.
    psql: &psql_conf_shared
      provider: postgresql
      hostname: 127.0.0.1
      port: 5432
      username: postgres
      # Quoted so the value loads as a string rather than the integer 123456.
      password: '123456'
      database: postgres
      default_table: shared_docs

  message: &message
    # Profile for the `amazon-rabbitmq` provider; host and credentials are
    # read from environment variables, TLS is on.
    amazon_mq: &amazon_mq_conf_shared
      provider: amazon-rabbitmq
      hostname: ${{ ENV.AWS_MQ_HOSTNAME }}
      port: 15672
      username: ${{ ENV.AWS_MQ_USERNAME }}
      password: ${{ ENV.AWS_MQ_PASSWORD }}
      tls: true
      virtualhost: /

    # Profile for the `rabbitmq` provider; host, port and credentials are
    # read from environment variables, TLS is off.
    rabbitmq: &rabbitmq_conf_shared
      provider: rabbitmq
      hostname: ${{ ENV.RABBIT_MQ_HOSTNAME }}
      port: ${{ ENV.RABBIT_MQ_PORT }}
      username: ${{ ENV.RABBIT_MQ_USERNAME }}
      password: ${{ ENV.RABBIT_MQ_PASSWORD }}
      tls: false
      virtualhost: /


# Toast event tracking system
# It can be backed by a message queue and/or a database.
toast:
  # Native backend — the only one switched on here; `path` names a JSON file.
  native:
    path: /tmp/marie/events.json
    enabled: true

  # RabbitMQ backend: reuses the shared RabbitMQ profile, switched off.
  rabbitmq:
    <<: *rabbitmq_conf_shared
    enabled: false

  # PostgreSQL backend: reuses the shared psql profile with its own table,
  # switched off.
  psql:
    <<: *psql_conf_shared
    enabled: false
    default_table: event_tracking


# Document Storage
# The storage service is used to store the data that is being processed
# Storage can be backed by an S3-compatible object store or by PostgreSQL.

storage:
  # S3 backend. Only consulted when the backend value is "s3".
  s3:
    enabled: false
    # When true, only metadata is stored in the storage backend.
    metadata_only: false
    # API endpoint to connect to: AWS S3 or any S3-compatible object store.
    endpoint_url: ${{ ENV.S3_ENDPOINT_URL }}
    # Optional: access key id when using static credentials.
    access_key_id: ${{ ENV.S3_ACCESS_KEY_ID }}
    # Optional: secret key when using static credentials.
    secret_access_key: ${{ ENV.S3_SECRET_ACCESS_KEY }}
    # Name of the bucket in S3.
    bucket_name: ${{ ENV.S3_BUCKET_NAME }}
    # Optional, for example "us-east-2".
    region: ${{ ENV.S3_REGION }}
    # Optional: enable when the endpoint is plain http.
    insecure: true
    # Optional: `path` selects path-style requests.
    addressing_style: path

  # PostgreSQL backend. Only consulted when the backend value is "psql".
  psql:
    <<: *psql_conf_shared
    enabled: false
    default_table: store_metadata

# Job Queue scheduler
scheduler:
  # PostgreSQL-backed job queue: shared psql profile, dedicated table, enabled.
  psql:
    <<: *psql_conf_shared
    enabled: true
    default_table: job_queue

# FLOW / GATEWAY configuration

with:
  # Ports and protocols are given as two lists of equal length.
  # NOTE(review): presumably paired by position (http on 51000, grpc on
  # 52000) — confirm against the gateway implementation.
  port:
    - 51000
    - 52000
  protocol:
    - http
    - grpc

  # Service discovery settings.
  discovery: true
  discovery_host: 127.0.0.1
  discovery_port: 8500

  host: 127.0.0.1

  # Monitoring is switched on, with its own port.
  monitoring: true
  port_monitoring: 57843

  event_tracking: true

  # Exposed endpoints. Every entry accepts POST and carries a summary plus
  # grouping tags.
  expose_endpoints:
    /document/extract:
      methods:
        - POST
      summary: Extract data-POC
      tags:
        - extract

    /status:
      methods:
        - POST
      summary: Status
      tags:
        - extract

    /text/status:
      methods:
        - POST
      summary: Extract data
      tags:
        - extract

    /ner/extract:
      methods:
        - POST
      summary: Extract NER
      tags:
        - ner

    /document/classify:
      methods:
        - POST
      summary: Classify document at page level
      tags:
        - classify

# NOTE(review): indentation is not visible in this view — confirm whether this
# key sits at the top level or belongs under `with`.
prefetch: 4

# Executors started by this Flow. Only `extract_t` is active; the remaining
# entries in this section are commented out.
# NOTE(review): nesting is not visible in this view; verify the indentation of
# `uses` / `with` / `pipeline` / `metas` against the file on disk.
executors:
# - name: extract_executor
# uses:
# jtype: TextExtractionExecutorMock
# metas:
# py_modules:
# - marie.executor.text
# timeout_ready: 3000000
# replicas: 1
## replicas: ${{ CONTEXT.gpu_device_count }}
# env :
# CUDA_VISIBLE_DEVICES: RR

# Active executor: TextExtractionExecutor, loaded from the marie.executor.text
# module listed under `py_modules`.
- name: extract_t
uses:
jtype: TextExtractionExecutor
# jtype: TextExtractionExecutorMock
with:
storage:
# postgresql configuration. Will be used only if value of backend is "psql"
psql:
<<: *psql_conf_shared
default_table: extract_metadata
enabled: True
# Pipeline named 'default'. Both page classifiers and the page indexer are
# disabled; the page splitter is enabled.
pipeline:
name: 'default'
page_classifier:
- model_name_or_path: 'marie/lmv3-medical-document-classification'
type: 'transformers'
device: 'cuda'
enabled: False
name: 'medical_page_classifier'
- model_name_or_path: 'marie/lmv3-medical-document-payer'
type: 'transformers'
enabled: False
device: 'cuda'
name: 'medical_payer_classifier'
page_indexer:
- model_name_or_path: 'marie/layoutlmv3-medical-document-indexer'
enabled: False
type: 'transformers'
device: 'cuda'
name: 'page_indexer_patient'
# Regex filter with pattern '.*'.
filter:
type: 'regex'
pattern: '.*'
page_splitter:
model_name_or_path: 'marie/layoutlmv3-medical-document-splitter'
enabled: True
metas:
py_modules:
- marie.executor.text
# NOTE(review): unit is not stated here — presumably milliseconds; confirm.
timeout_ready: 3000000
# Fixed at a single replica; the GPU-count-based alternative is commented out.
replicas: 1
# replicas: ${{ CONTEXT.gpu_device_count }}
# NOTE(review): `RR` presumably requests round-robin GPU assignment — confirm.
env:
CUDA_VISIBLE_DEVICES: RR

# - name: extract_xyz
# uses:
# jtype: TextExtractionExecutorMock
# metas:
# py_modules:
# - marie.executor.text
# timeout_ready: 3000000
# replicas: 1
## replicas: ${{ CONTEXT.gpu_device_count }}
# env :
# CUDA_VISIBLE_DEVICES: RR

# - name: ner_t
# uses:
# jtype: NerExtractionExecutor
# with:
# model_name_or_path : 'rms/layoutlmv3-large-corr-ner'
# <<: *psql_conf_shared
# storage_enabled : False
# metas:
# py_modules:
## - marie_server.executors.ner.mserve_torch
# - marie.executor.ner
# timeout_ready: 3000000
## replicas: 1
# replicas: ${{ CONTEXT.gpu_device_count }}
# env :
# CUDA_VISIBLE_DEVICES: RR

# - name: document_classifier
# uses:
# jtype: DocumentClassificationExecutor
# with:
# model_name_or_path :
# - 'marie/layoutlmv3-document-classification'
# - 'marie/layoutlmv3-document-classification'
# <<: *psql_conf_shared
# storage_enabled : False
# metas:
# py_modules:
# - marie.executor.classifier
# timeout_ready: 3000000
## replicas: 1
# replicas: ${{ CONTEXT.gpu_device_count }}
# env :
# CUDA_VISIBLE_DEVICES: RR
##
# - name: overlay_t
# uses:
# jtype: OverlayExecutor
# with:
# model_name_or_path : 'rms/holder'
# <<: *storage_conf
# storage_enabled : True
# metas:
# py_modules:
# - marie.executor.overlay
# timeout_ready: 3000000
# replicas: 1

# Authentication and Authorization configuration

auth:
  # API keys accepted by the service; each entry has a name, a key, an enabled
  # flag and a list of roles.
  # NOTE(review): the keys are stored in plain text — confirm these are sample
  # values before this file is shared.
  keys:
    - name: service-A
      api_key: mas_0aPJ9Q9nUO1Ac1vJTfffXEXs9FyGLf9BzfYgZ_RaHm707wmbfHJNPQ
      enabled: true
      roles:
        - admin
        - user

    - name: service-B
      api_key: mau_t6qDi1BcL1NkLI8I6iM8z1va0nZP01UQ6LWecpbDz6mbxWgIIIZPfQ
      enabled: true
      roles:
        - admin
        - user
Loading

0 comments on commit c98b063

Please sign in to comment.