# veld_publish_to_hf.yaml

# VELD metadata for this compose file. Keys prefixed with `x-` are Compose
# extension fields, so Compose itself does not act on this section. It describes
# the purpose of the service, the input volume it expects, and the environment
# variables that configure it.
x-veld:
  code:
    description: >-
      simple service to push spacy models to huggingface. Important: Only works from
      spacy v3.* onwards!
    topic:
      - "NLP"
      - "ETL"

    input:
      # NOTE(review): the service definition in this file only mounts ./src/;
      # presumably the consuming chain mounts the model folder to this path — confirm.
      - volume: /veld/input/
        file_type: "spaCy model"
        content: "NLP model"

    config:
      - environment_var: model_name
        description: >-
          name of the model, to be used for huggingface metadata. IMPORTANT: do not put
          double underscores into the model name, as this crashes spacy while publishing.
        var_type: "str"
        optional: true
      - environment_var: version
        description: >-
          version of the model, to be used for huggingface metadata. IMPORTANT: spacy
          crashes when the version tag contains the character `v` in front of numeric+dot
          version identifiers: E.g. `v1.1` crashes, while `1.1` works.
        var_type: "str"
        optional: true
      - environment_var: hf_token
        # Folded scalar: blank lines below become newlines in the value, so each
        # bullet stays on its own line; wrapped lines are joined with spaces.
        description: >-
          huggingface authentication token.

          IMPORTANT: DON'T HARDCODE THE TOKEN HERE!

          Rather define this environment variable on the host, by either:

          - PREFERRED: do this manually before calling this docker compose service. On
          linux and mac, this can be done with `export hf_token=<TOKEN>` and on windows
          with `set hf_token=<TOKEN>` before launching a docker compose service.

          - LESS PREFERRED: only do this if you know how `.gitignore` works: you may
          persist it in a `.env` file (with the content simply being `hf_token=<TOKEN>`)
          next to the chain veld yaml file, which docker would automatically load, and
          then add the `.env` to `.gitignore`!
        var_type: "str"

# Compose service that runs the publishing script shipped in ./src/.
services:
  veld_publish_to_hf:
    # Build the image from the build context in this directory.
    build: "."
    volumes:
      # Bind-mount the local sources to the path the command below runs from.
      - "./src/:/veld/code/"
    command: "bash /veld/code/publish_to_hf.sh"
    environment:
      # Null placeholders for the optional settings described in x-veld above.
      # NOTE(review): presumably overridden by the consuming chain file; Compose
      # otherwise resolves null-valued entries from the host environment — confirm.
      model_name: null
      version: null
      # Interpolated by Compose from the host environment (or a `.env` file), so
      # the token itself is never written into this file.
      hf_token: "$hf_token"