# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Authors:
# - Daniel Drizhuk, d.drizhuk@gmail.com, 2017
# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019
################################
# Experiment specific parameters
[Experiment]
name: ATLAS
################################
# Pilot parameters
[Pilot]
# The default file name for the pilot log
pilotlog: pilotlog.txt
stageinlog: stageinlog.txt
stageoutlog: stageoutlog.txt
# The file name for the job definition
pandajobdata: pandaJobData.out
# Run with a fake test job and no server updates (values: 'fake', 'real'). The test job type can be 'production' or
# 'user'. The test transfer type can be 'direct' or 'NULL'. The test job command can be 'normal' or 'sleep' ('normal'
# means a standard reconstruction job, while 'sleep' means the payload command is 'sleep 1' with no input or output
# transfers)
pandajob: real
testjobtype: production
testjobcommand: normal
testtransfertype: NULL
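# Example (commented out): the settings below would run a fake user test job, with no server updates, whose
# payload is 'sleep 1' and which performs no input or output transfers
# pandajob: fake
# testjobtype: user
# testjobcommand: sleep
# testtransfertype: NULL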
# The URL for the PanDA server
pandaserver: http://127.0.0.1:8080
# pandaserver: https://aipanda007.cern.ch:25443
# The heartbeat period in seconds (30 * 60 = 1800 s in normal mode, 5 * 60 = 300 s in debug mode)
heartbeat: 1800
debug_heartbeat: 300
# Heartbeat message file (only used when Pilot is not sending heartbeats to server)
heartbeat_message: heartbeat.json
# Job IDs can be stored to a file that is picked up by the wrapper
jobid_file: pandaIDs.out
# The minimum required disk space for the pilot to run a job
free_space_limit: 2 GB
# The maximum output file size
maximum_output_file_size: 500 GB
# The maximum allowed sum of all input file sizes (files accessed via direct access are not counted by the pilot)
# (fall-back value; the schedconfig value is used primarily)
maximum_input_file_sizes: 14336 MB
# Size limit of the payload stdout during running; the unit is kB (value = 2 * 1024 ** 2)
local_size_limit_stdout: 2097152
# The maximum number of getJob requests
maximum_getjob_requests: 1
# Looping job time limits; if a job has not written anything within the relevant limit below, it is considered
# a looping job. The verification time is how often the check is performed, in seconds
looping_verification_time: 600
# Limit for production jobs (12 * 3600 s)
looping_limit_default_prod: 43200
# Limit for user jobs (3 * 3600 s)
looping_limit_default_user: 10800
# The minimum allowed looping limit (2 * 3600 s)
looping_limit_min_default: 7200
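# Example: with the defaults above, a production job that has not written anything for 12 hours (43200 s) is
# flagged as looping; the check itself runs every 600 s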
# Proxy verification time (used by monitoring) in seconds
proxy_verification_time: 600
# Disk space verification time (how often the available disk space is checked)
disk_space_verification_time: 300
# Memory usage verification time (how often the memory monitor output will be checked)
memory_usage_verification_time: 60
# Process verification time
process_verification_time: 300
# Output file size verification time
output_verification_time: 300
# The default thread check time in seconds, used by thread monitoring
thread_check: 10
# The default CPU check time in seconds, used by CPU monitoring
cpu_check: 60
# The timing file used to store various timing measurements
timing_file: pilot_timing.json
# Optional error log (leave filename empty if not wanted)
error_log: piloterrorlog.txt
# List of redundant files and directories to be removed prior to log file creation
# For ATLAS, any initial /cvmfs path prefix will automatically be corrected if ATLAS_LOCAL_ROOT_BASE is set
redundant: /cvmfs/atlas.cern.ch/repo/sw/PandaPilot/config/redundant.txt
# Utility commands that may be launched by the pilot before the payload, with the payload, after the payload has
# started, or with stage-in. E.g. 'MemoryMonitor' is used as an internal name; the actual command is 'prmon'
utility_before_payload:
utility_with_payload:
utility_after_payload_started:
utility_with_stagein:
################################
# Information service parameters
[Information]
# Path to local cache
#cache_dir: /lustre/atlas/proj-shared/csc108/debug/atlas/HPC_pilot_test/queue_cache #for Titan
cache_dir:
# URL for the PanDA queues json
queues: http://atlas-agis-api.cern.ch/request/pandaqueue/query/list/?json
# URL for the sites json
sites: http://atlas-agis-api.cern.ch/request/site/query/list/?json
# URL for the DDM endpoints json
storages: http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json
# URL for the SchedConfig json
schedconfig: http://pandaserver.cern.ch:25085/cache/schedconfig
# File name for the queuedata json
queuedata: queuedata.json
# Overwrite acopytools for queuedata (keys are transfer activity labels, values are ordered lists of copy tools)
#acopytools: {'pr':['rucio']}
#acopytools: {'pr':['rucio'], 'pw':['gfalcopy'], 'pl':['gfalcopy']}
#acopytools: {'pr': ['lsm'], 'pw': ['lsm']}
################################
# Payload parameters
[Payload]
# File name for the job report produced by the payload
jobreport: jobReport.json
# File name for production job metadata
metadata: metadata.xml
# File names for stdout/stderr
payloadstdout: payload.stdout
payloadstderr: payload.stderr
# Event service executor type
# default: generic (alternatives: base, raythena)
executor_type: raythena
################################
# Container parameters
[Container]
# Master parameter (unused)
# Is the pilot allowed to use containers? If False, then any database settings are ignored
# allow_container: False
# The setup type can be either ALRB or (explicit) singularity
setup_type: ALRB
# Name of script file that will contain the payload command to be executed in the container
container_script: container_script.sh
# Name of script file that will contain the setup command for the payload to be executed in the container
release_setup: my_release_setup.sh
# Name of the file that will contain the payload pid
pid_file: pid.txt
# Execute middleware in container
use_middleware_container: False
# If a middleware container script is listed (e.g. stagein.py), the pilot will perform all stage-in and/or stage-out
# steps in a standard container (to be revised).
# Note: if no middleware container image is specified below, the middleware will still be executed by the specified script
# (without using a container).
middleware_container_stagein_script: stagein.py
#middleware_container_stagein_script:
middleware_container_stageout_script:
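# Hypothetical example (assuming a stage-out counterpart to stagein.py is available):
# middleware_container_stageout_script: stageout.py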
# Error information and stage-in file status are saved in a JSON file by the stage-in script and later read by the pilot
stagein_dictionary: stagein_dictionary.json
middleware_stagein_stdout: stagein_stdout.txt
middleware_stagein_stderr: stagein_stderr.txt
# Name of middleware image (to be revised)
# This image is used if middleware is not found locally on the worker node. Middleware is expected to be present
# in the container image
middleware_container:
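# Hypothetical example (an unpacked image distributed via CVMFS; the actual image path depends on the site):
# middleware_container: /cvmfs/unpacked.cern.ch/registry.hub.docker.com/somerepo/middleware:latest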
################################
# Harvester parameters
[Harvester]
# Name of the job request file. The pilot places this file in the pilot launch directory when it wants Harvester to
# send another job (Harvester places the new job definition in the same directory)
job_request_file: worker_requestjob.json
# Name of the kill worker file. The pilot places this file in the pilot launch directory when it has finished all jobs
# and wants Harvester to kill the worker (virtual machine)
kill_worker_file: kill_worker
# Name of the file with the list of PanDA job IDs to be processed by the HPC pilot
jobs_list_file: worker_pandaids.json
# Name of the file with the PanDA job(s) to be processed by the HPC pilot
pandajob_file: HPCJobs.json
# Name of file with worker report
workerAttributesFile: worker_attributes.json
# Name of file for declaration of stageout
StageOutnFile: event_status.dump.json
################################
# HPC parameters
[HPC]
# Path to a scratch disk (RAM disk, SSD, etc.) for placing the job working directory
scratch: /tmp/scratch/
################################
# Rucio parameters
[Rucio]
# Rucio server URL for traces
url: https://rucio-lb-prod.cern.ch/traces/
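################################
# A minimal sketch (assuming Python's standard configparser; the pilot itself may use its own config wrapper)
# of how a consumer could read values from this file:
#
#   import configparser
#   cfg = configparser.ConfigParser()
#   cfg.read('default.cfg')
#   heartbeat = cfg.getint('Pilot', 'heartbeat')  # -> 1800
#   url = cfg.get('Rucio', 'url')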