Source code for toil.worker

# Copyright (C) 2015-2021 Regents of the University of California
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import copy
import json
import logging
import os
import pickle
import random
import shutil
import signal
import socket
import stat
import sys
import time
import traceback
from contextlib import contextmanager
from typing import Any, Callable, Iterator, List, Optional

from toil import logProcessContext
from toil.common import Config, Toil, safeUnpickleFromStream
from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
                            CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE)
from toil.deferred import DeferredFunctionManager
from toil.fileStores.abstractFileStore import AbstractFileStore
from toil.job import CheckpointJobDescription, Job, JobDescription
from toil.jobStores.abstractJobStore import AbstractJobStore
from toil.lib.expando import MagicExpando
from toil.lib.io import make_public_dir
from toil.lib.resources import (get_total_cpu_time,
                                get_total_cpu_time_and_memory_usage)
from toil.statsAndLogging import configure_root_logger, set_log_level

logger = logging.getLogger(__name__)


[docs]class StatsDict(MagicExpando):
    """Subclass of MagicExpando for type-checking purposes."""

    jobs: List[str]


[docs]def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, config: Config) -> Optional[JobDescription]:
    """
    Returns the next chainable job's JobDescription after the given predecessor
    JobDescription, if one exists, or None if the chain must terminate.

    :param predecessor: The job to chain from
    :param jobStore: The JobStore to fetch JobDescriptions from.
    :param config: The configuration for the current run.
    """
    #If no more jobs to run or services not finished, quit
    if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint != None):
        logger.debug("Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
                     predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint != None))
        return None


    #Get the next set of jobs to run
    jobs = list(predecessor.nextSuccessors())
    if len(jobs) == 0:
        # If there are no jobs, we might just not have any children.
        logger.debug("Stopping running chain of jobs because job has no ready children or follow-ons")
        return None

    #If there are 2 or more jobs to run in parallel we quit
    if len(jobs) >= 2:
        logger.debug("No more jobs can run in series by this worker,"
                    " it's got %i successors", len(jobs))
        logger.debug("Two distinct successors are %s and %s", jobs[0], jobs[1])
        return None

    # Grab the only job that should be there.
    successorID = jobs[0]

    logger.debug("%s would chain to ID %s", predecessor, successorID)

    # Load the successor JobDescription
    successor = jobStore.load_job(successorID)

    #We check the requirements of the successor to see if we can run it
    #within the current worker
    if successor.memory > predecessor.memory:
        logger.debug("We need more memory for the next job, so finishing")
        return None
    if successor.cores > predecessor.cores:
        logger.debug("We need more cores for the next job, so finishing")
        return None
    if successor.disk > predecessor.disk:
        logger.debug("We need more disk for the next job, so finishing")
        return None
    if successor.preemptible != predecessor.preemptible:
        logger.debug("Preemptibility is different for the next job, returning to the leader")
        return None
    if successor.predecessorNumber > 1:
        logger.debug("The next job has multiple predecessors; we must return to the leader.")
        return None

    if len(successor.services) > 0:
        logger.debug("The next job requires services that will not yet be started; we must return to the leader.")
        return None

    if isinstance(successor, CheckpointJobDescription):
        # Check if job is a checkpoint job and quit if so
        logger.debug("Next job is checkpoint, so finishing")
        return None

    # Made it through! This job is chainable.
    return successor

[docs]def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobStoreID: str, redirectOutputToLogFile: bool = True) -> int:
    """
    Worker process script, runs a job.

    :param jobStore: The JobStore to fetch JobDescriptions from.
    :param config: The configuration for the current run.
    :param jobName: The "job name" (a user friendly name) of the job to be run
    :param jobStoreID: The job store ID of the job to be run

    :return int: 1 if a job failed, or 0 if all jobs succeeded
    """

    configure_root_logger()
    set_log_level(config.logLevel)

    ##########################################
    #Create the worker killer, if requested
    ##########################################

    logFileByteReportLimit = config.maxLogFileSize

    if config.badWorker > 0 and random.random() < config.badWorker:
        # We need to kill the process we are currently in, to simulate worker
        # failure. We don't want to just send SIGKILL, because we can't tell
        # that from a legitimate OOM on our CI runner. We're going to send
        # SIGUSR1 so our terminations are distinctive, and then SIGKILL if that
        # didn't stick. We definitely don't want to do this from *within* the
        # process we are trying to kill, so we fork off. TODO: We can still
        # leave the killing code running after the main Toil flow is done, but
        # since it's now in a process instead of a thread, the main Python
        # process won't wait around for its timeout to expire. I think this is
        # better than the old thread-based way where all of Toil would wait
        # around to be killed.

        killTarget = os.getpid()
        sleepTime = config.badWorkerFailInterval * random.random()
        if os.fork() == 0:
            # We are the child
            # Let the parent run some amount of time
            time.sleep(sleepTime)
            # Kill it gently
            os.kill(killTarget, signal.SIGUSR1)
            # Wait for that to stick
            time.sleep(0.01)
            try:
                # Kill it harder. Hope the PID hasn't already been reused.
                # If we succeeded the first time, this will OSError
                os.kill(killTarget, signal.SIGKILL)
            except OSError:
                pass
            # Exit without doing any of Toil's cleanup
            os._exit(0)

        # We don't need to reap the child. Either it kills us, or we finish
        # before it does. Either way, init will have to clean it up for us.

    ##########################################
    #Load the environment for the job
    ##########################################

    #First load the environment for the job.
    with jobStore.read_shared_file_stream("environment.pickle") as fileHandle:
        environment = safeUnpickleFromStream(fileHandle)
    env_reject = {
        "TMPDIR",
        "TMP",
        "HOSTNAME",
        "HOSTTYPE",
        "HOME",
        "LOGNAME",
        "USER",
        "DISPLAY",
        "JAVA_HOME",
        "XDG_SESSION_TYPE",
        "XDG_SESSION_CLASS",
        "XDG_SESSION_ID",
        "XDG_RUNTIME_DIR",
        "XDG_DATA_DIRS",
        "DBUS_SESSION_BUS_ADDRESS"
    }
    for i in environment:
        if i == "PATH":
            # Handle path specially. Sometimes e.g. leader may not include
            # /bin, but the Toil appliance needs it.
            if i in os.environ and os.environ[i] != '':
                # Use the provided PATH and then the local system's PATH
                os.environ[i] = environment[i] + ':' + os.environ[i]
            else:
                # Use the provided PATH only
                os.environ[i] = environment[i]
        elif i not in env_reject:
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    ##########################################
    #Setup the temporary directories.
    ##########################################
    # Dir to put all this worker's temp files in.
    assert config.workflowID is not None
    toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
    # Dir to put lock files in, ideally not on NFS.
    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(config.workflowID, config.workDir, config.coordination_dir)
    localWorkerTempDir = make_public_dir(in_directory=toilWorkflowDir)
    os.chmod(localWorkerTempDir, 0o755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>

    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    # Do we even want to redirect output? Let the config make us not do it.
    redirectOutputToLogFile = redirectOutputToLogFile and not config.disableWorkerOutputCapture

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    if redirectOutputToLogFile:
        # Announce that we are redirecting logging, and where it will now go.
        # This is important if we are trying to manually trace a faulty worker invocation.
        logger.info("Redirecting logging to %s", tempWorkerLogPath)
        sys.stdout.flush()
        sys.stderr.flush()

        # Save the original stdout and stderr (by opening new file descriptors
        # to the same files)
        origStdOut = os.dup(1)
        origStdErr = os.dup(2)

        # Open the file to send stdout/stderr to.
        logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

        # Replace standard output with a descriptor for the log file
        os.dup2(logFh, 1)

        # Replace standard error with a descriptor for the log file
        os.dup2(logFh, 2)

        # Since we only opened the file once, all the descriptors duped from
        # the original will share offset information, and won't clobber each
        # others' writes. See <http://stackoverflow.com/a/5284108/402891>. This
        # shouldn't matter, since O_APPEND seeks to the end of the file before
        # every write, but maybe there's something odd going on...

        # Close the descriptor we used to open the file
        os.close(logFh)

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    # Worker log file trapped from here on in
    ##########################################

    jobAttemptFailed = False
    failure_exit_code = 1
    statsDict = StatsDict()  # type: ignore[no-untyped-call]
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []

    def blockFn() -> bool:
        return True
    listOfJobs = [jobName]
    job = None
    try:

        # Put a message at the top of the log, just to make sure it's working.
        logger.info("---TOIL WORKER OUTPUT LOG---")
        sys.stdout.flush()

        logProcessContext(config)

        ##########################################
        # Connect to the deferred function system
        ##########################################
        deferredFunctionManager = DeferredFunctionManager(toil_coordination_dir)

        ##########################################
        # Load the JobDescription
        ##########################################

        jobDesc = jobStore.load_job(jobStoreID)
        listOfJobs[0] = str(jobDesc)
        logger.debug("Parsed job description")

        ##########################################
        # Cleanup from any earlier invocation of the job
        ##########################################

        if jobDesc.command is None:
            logger.debug("Job description has no body to run.")
            # Cleanup jobs already finished
            jobDesc.clear_nonexistent_dependents(jobStore)
            logger.debug("Cleaned up any references to completed successor jobs")

        # This cleans the old log file which may
        # have been left if the job is being retried after a job failure.
        oldLogFile = jobDesc.logJobStoreFileID
        if oldLogFile is not None:
            jobDesc.logJobStoreFileID = None
            jobStore.update_job(jobDesc)  # Update first, before deleting any files
            jobStore.delete_file(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        if isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None:
            # The job is a checkpoint, and is being restarted after previously completing
            logger.debug("Job is a checkpoint")
            # If the checkpoint still has extant successors or services, its
            # subtree didn't complete properly. We handle the restart of the
            # checkpoint here, removing its previous subtree.
            if next(jobDesc.successorsAndServiceHosts(), None) is not None:
                logger.debug("Checkpoint has failed; restoring")
                # Reduce the try count
                assert jobDesc.remainingTryCount >= 0
                jobDesc.remainingTryCount = max(0, jobDesc.remainingTryCount - 1)
                jobDesc.restartCheckpoint(jobStore)
            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
                #Delete any remnant files
                list(map(jobStore.delete_file, list(filter(jobStore.file_exists, jobDesc.checkpointFilesToDelete))))

        ##########################################
        #Setup the stats, if requested
        ##########################################

        if config.stats:
            startClock = get_total_cpu_time()

        startTime = time.time()
        while True:
            ##########################################
            #Run the job body, if there is one
            ##########################################

            logger.info("Working on job %s", jobDesc)

            if jobDesc.command is not None:
                assert jobDesc.command.startswith("_toil ")
                logger.debug("Got a command to run: %s" % jobDesc.command)
                # Load the job. It will use the same JobDescription we have been using.
                job = Job.loadJob(jobStore, jobDesc)
                if isinstance(jobDesc, CheckpointJobDescription):
                    # If it is a checkpoint job, save the command
                    jobDesc.checkpoint = jobDesc.command

                logger.info("Loaded body %s from description %s", job, jobDesc)

                # Create a fileStore object for the job
                fileStore = AbstractFileStore.createFileStore(jobStore, jobDesc, localWorkerTempDir, blockFn,
                                                              caching=config.caching)
                with job._executor(stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with deferredFunctionManager.open() as defer:
                        with fileStore.open(job):
                            # Get the next block function to wait on committing this job
                            blockFn = fileStore.waitForCommit

                            # Run the job, save new successors, and set up
                            # locally (but don't commit) successor
                            # relationships and job completion.
                            # Pass everything as name=value because Cactus
                            # likes to override _runner when it shouldn't and
                            # it needs some hope of finding the arguments it
                            # wants across multiple Toil versions. We also
                            # still pass a jobGraph argument to placate old
                            # versions of Cactus.
                            job._runner(jobGraph=None, jobStore=jobStore, fileStore=fileStore, defer=defer)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

                logger.info("Completed body for %s", jobDesc)

            else:
                #The command may be none, in which case
                #the JobDescription is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                logger.debug("No user job to run, so finishing")
                break

            if AbstractFileStore._terminateEvent.is_set():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another job within the worker
            ##########################################
            successor = nextChainable(jobDesc, jobStore, config)
            if successor is None or config.disableChaining:
                # Can't chain any more jobs. We are going to stop.

                logger.info("Not chaining from job %s", jobDesc)

                # TODO: Somehow the commit happens even if we don't start it here.

                break

            logger.info("Chaining from %s to %s", jobDesc, successor)

            ##########################################
            # We have a single successor job that is not a checkpoint job. We
            # reassign the ID of the current JobDescription to the successor.
            # We can then delete the successor JobDescription (under its old
            # ID) in the jobStore, as it is wholly incorporated into the
            # current one.
            ##########################################

            # Make sure nothing has gone wrong and we can really chain
            assert jobDesc.memory >= successor.memory
            assert jobDesc.cores >= successor.cores

            # Save the successor's original ID, so we can clean it (and its
            # body) up after we finish executing it.
            successorID = successor.jobStoreID

            # add the successor to the list of jobs run
            listOfJobs.append(str(successor))

            # Now we need to become that successor, under the original ID.
            successor.replace(jobDesc)
            jobDesc = successor

            # Problem: successor's job body is a file that will be cleaned up
            # when we delete the successor job by ID. We can't just move it. So
            # we need to roll up the deletion of the successor job by ID with
            # the deletion of the job ID we're currently working on.
            jobDesc.jobsToDelete.append(successorID)

            # Clone the now-current JobDescription (which used to be the successor).
            # TODO: Why??? Can we not?
            jobDesc = copy.deepcopy(jobDesc)

            # Build a fileStore to update the job and commit the replacement.
            # TODO: can we have a commit operation without an entire FileStore???
            fileStore = AbstractFileStore.createFileStore(jobStore, jobDesc, localWorkerTempDir, blockFn,
                                                          caching=config.caching)

            # Update blockFn to wait for that commit operation.
            blockFn = fileStore.waitForCommit

            # This will update the job once the previous job is done updating
            fileStore.startCommit(jobState=True)

            # Clone the current job description again, so that further updates
            # to it (such as new successors being added when it runs) occur
            # after the commit process we just kicked off, and aren't committed
            # early or partially.
            jobDesc = copy.deepcopy(jobDesc)
            # Bump its version since saving will do that too and we don't want duplicate versions.
            jobDesc.pre_update_hook()


            logger.debug("Starting the next job")

        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        if redirectOutputToLogFile:
            logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)

        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except Exception as e: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
        if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
            # We need to inform the leader that this is a CWL workflow problem
            # and it needs to inform its caller.
            failure_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
        AbstractFileStore._terminateEvent.set()
    finally:
        # Get rid of our deferred function manager now so we can't mistake it
        # for someone else's if we do worker cleanup.
        del deferredFunctionManager
        try:
            import cwltool.main

            cwltool.main._terminate_processes()
        except (ImportError, ModuleNotFoundError):
            pass
        except Exception as e:
            logger.debug("cwltool.main._terminate_processess exception: %s", (e))
            raise e


    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ##########################################

    blockFn()

    ##########################################
    #All the asynchronous worker/update threads must be finished now,
    #so safe to test if they completed okay
    ##########################################

    if AbstractFileStore._terminateEvent.is_set():
        # Something has gone wrong.

        # Clobber any garbage state we have for this job from failing with
        # whatever good state is still stored in the JobStore
        jobDesc = jobStore.load_job(jobStoreID)
        # Remember that we failed
        jobAttemptFailed = True

    ##########################################
    #Cleanup
    ##########################################

    # Close the worker logging
    # Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    if redirectOutputToLogFile:
        # Flush at the OS level
        os.fsync(1)
        os.fsync(2)

        # Close redirected stdout and replace with the original standard output.
        os.dup2(origStdOut, 1)

        # Close redirected stderr and replace with the original standard error.
        os.dup2(origStdErr, 2)

        # sys.stdout and sys.stderr don't need to be modified at all. We don't
        # need to call redirectLoggerStreamHandlers since they still log to
        # sys.stderr

        # Close our extra handles to the original standard output and standard
        # error streams, so we don't leak file handles.
        os.close(origStdOut)
        os.close(origStdErr)

    # Now our file handles are in exactly the state they were in before.

    # Copy back the log file to the global dir, if needed.
    # Note that we work with bytes instead of characters so we can seek
    # relative to the end (since Python won't decode Unicode backward, or even
    # interpret seek offsets in characters for us). TODO: We may get invalid or
    # just different Unicode by breaking up a character at the boundary!
    if jobAttemptFailed and redirectOutputToLogFile:
        jobDesc.logJobStoreFileID = logJobStoreFileID = jobStore.getEmptyFileStoreID(
            jobDesc.jobStoreID, cleanup=True
        )
        jobDesc.chainedJobs = listOfJobs
        with jobStore.update_file_stream(logJobStoreFileID) as w:
            with open(tempWorkerLogPath, 'rb') as f:
                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
                    if logFileByteReportLimit > 0:
                        f.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                    elif logFileByteReportLimit < 0:
                        f.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
                # Dump the possibly-invalid-Unicode bytes into the log file
                w.write(f.read()) # TODO load file using a buffer
        # Commit log file reference back to JobStore
        jobStore.update_job(jobDesc)

    elif ((debugging or (config.writeLogsFromAllJobs and not jobDesc.local))
          and redirectOutputToLogFile):  # write log messages
        with open(tempWorkerLogPath, 'rb') as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                if logFileByteReportLimit > 0:
                    logFile.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                elif logFileByteReportLimit < 0:
                    logFile.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
            # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
            # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
            logMessages = [line.decode('utf-8', 'skip') for line in logFile.read().splitlines()]
        statsDict.logs.names = listOfJobs
        statsDict.logs.messages = logMessages

    if (debugging or config.stats or statsDict.workers.logsToMaster) and not jobAttemptFailed:  # We have stats/logging to report back
        jobStore.write_logs(json.dumps(statsDict, ensure_ascii=True))

    # Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not jobAttemptFailed) or (cleanUp == 'onError' and jobAttemptFailed):
        def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None:
            """
            When encountering an error removing a file or directory, make sure
            the parent directory is writable.

            cwltool likes to lock down directory permissions, and doesn't clean
            up after itself.
            """
            # Just chmod it for rwx for user. This can't work anyway if it isn't ours.
            try:
                os.chmod(os.path.dirname(path),  stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
            except PermissionError as e:
                logger.error('Could not set permissions on %s to allow cleanup of %s: %s', os.path.dirname(path), path, e)
        shutil.rmtree(localWorkerTempDir, onerror=make_parent_writable)

    # This must happen after the log file is done with, else there is no place to put the log
    if (not jobAttemptFailed) and jobDesc.is_subtree_done():
        # We can now safely get rid of the JobDescription, and all jobs it chained up
        for otherID in jobDesc.jobsToDelete:
            jobStore.delete_job(otherID)
        jobStore.delete_job(str(jobDesc.jobStoreID))

    if jobAttemptFailed:
        return failure_exit_code
    else:
        return 0

[docs]def parse_args(args: List[str]) -> argparse.Namespace:
    """
    Parse command-line arguments to the worker.
    """

    # Drop the program name
    args = args[1:]

    # Make the parser
    parser = argparse.ArgumentParser()

    # Now add all the options to it

    # Base required job information
    parser.add_argument("jobName", type=str,
        help="Text name of the job being run")
    parser.add_argument("jobStoreLocator", type=str,
        help="Information required to connect to the job store")
    parser.add_argument("jobStoreID", type=str,
        help="ID of the job within the job store")

    # Additional worker abilities
    parser.add_argument("--context", default=[], action="append",
        help="""Pickled, base64-encoded context manager(s) to run job inside of.
                Allows the Toil leader to pass setup and cleanup work provided by the
                batch system, in the form of pickled Python context manager objects,
                that the worker can then run before/after the job on the batch
                system's behalf.""")

    return parser.parse_args(args)


[docs]@contextmanager
def in_contexts(contexts: List[str]) -> Iterator[None]:
    """
    Unpickle and enter all the pickled, base64-encoded context managers in the
    given list. Then do the body, then leave them all.
    """

    if len(contexts) == 0:
        # Base case: nothing to do
        yield
    else:
        first = contexts[0]
        rest = contexts[1:]

        try:
            manager = pickle.loads(base64.b64decode(first.encode('utf-8')))
        except:
            exc_info = sys.exc_info()
            logger.error('Exception while unpickling context manager: ', exc_info=exc_info)
            raise

        with manager:
            # After entering this first context manager, do the rest.
            # It might set up stuff so we can decode later ones.
            with in_contexts(rest):
                yield


[docs]def main(argv: Optional[List[str]] = None) -> None:
    if argv is None:
        argv = sys.argv

    # Parse our command line
    options = parse_args(argv)

    # Parse input args
    jobName = argv[1]
    jobStoreLocator = argv[2]
    jobStoreID = argv[3]

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = Toil.resumeJobStore(options.jobStoreLocator)
    config = jobStore.config

    with in_contexts(options.context):
        # Call the worker
        exit_code = workerScript(jobStore, config, options.jobName, options.jobStoreID)

    # Exit with its return value
    sys.exit(exit_code)
Source code for toil.worker

Toil

Navigation

Related Topics