Source code for cagecleaner.genome_run

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from cagecleaner.run import Run
from cagecleaner.utils import run_command

import logging


# Module-level logger, named after this module so its records can be told apart.
LOG = logging.getLogger(__name__)


class GenomeRun(Run):
    """
    Abstract intermediary class grouping the methods shared by every run
    involving whole-genome dereplication.

    Inherits from:
        Run: Base class providing argument parsing, hit recovery, session
            filtering and output generation functionalities

    See Also:
        LocalGenomeRun: Full-genome dereplication for hits in local sequences.
        RemoteGenomeRun: Full-genome dereplication for hits in remote sequences.
    """

    def __init__(self, args) -> None:
        """
        Initialise a GenomeRun instance. Only runs the base class init.

        Args:
            args (argparse.Namespace): Parsed command-line arguments

        Returns:
            None
        """
        super().__init__(args)

    def dereplicate_genomes(self) -> None:
        """
        Dereplicate the gathered genome files using whole-genome ANI
        similarity with skDER.

        Sets the dereplication input directory (``self.DEREP_IN_DIR``) to the
        full genome folder (``self.TEMP_GENOME_DIR``), runs the skDER command
        with its output directed to ``self.DEREP_OUT_DIR``, and logs how many
        genomes went in and how many representatives came out.

        Reads ``self.TEMP_GENOME_DIR``, ``self.DEREP_OUT_DIR``,
        ``self.identity``, ``self.coverage``, ``self.cores`` and
        ``self.low_mem``; these are presumably set up by the base class
        (not visible from this module).

        Returns:
            None

        Raises:
            RuntimeError: If the input folder is empty or does not exist,
                or if the skDER command run fails.
        """
        self.DEREP_IN_DIR = self.TEMP_GENOME_DIR

        LOG.info(f'Dereplicating genomes in {self.DEREP_IN_DIR} with identity cutoff of {self.identity} % and coverage cutoff of {self.coverage} %')

        # Fail early with a clear message instead of letting skDER choke on
        # a missing or empty input folder.
        if not self.DEREP_IN_DIR.exists():
            msg = "Dereplication input directory does not exist!"
            LOG.critical(msg)
            raise RuntimeError(msg)

        if next(self.DEREP_IN_DIR.iterdir(), None) is None:
            msg = "The dereplication input folder is empty!"
            LOG.critical(msg)
            raise RuntimeError(msg)

        LOG.info("Starting skDER")

        # Pick the mode explicitly rather than multiplying a string by the
        # flag: that trick breaks when low_mem is None (TypeError) or an
        # integer greater than 1 (repeated prefix, invalid mode name).
        derep_mode = 'low_mem_greedy' if self.low_mem else 'greedy'

        cmd = [
            'skder',
            '-g', str(self.DEREP_IN_DIR),
            '-o', str(self.DEREP_OUT_DIR),
            '-i', str(self.identity),
            '-f', str(self.coverage),
            '-c', str(self.cores),
            '-d', derep_mode,
            # NOTE(review): '-n' presumably enables skDER's secondary
            # clustering step; confirm against the skDER version in use.
            '-n',
        ]

        try:
            run_command(cmd)
        except RuntimeError as err:
            msg = "Dereplicating genomes with skDER failed!"
            LOG.critical(msg)
            # Chain the original error so the underlying cause stays visible
            # in the traceback.
            raise RuntimeError(msg) from err

        LOG.info("Dereplication done!")

        # A file counts as a genome when any of its suffix components is a
        # FASTA extension. Path.suffixes splits 'x.fna.gz' into
        # ['.fna', '.gz'], so gzipped files are matched through their inner
        # suffix and compound entries such as '.fna.gz' are not needed.
        fasta_suffixes = {'.fna', '.fa', '.fasta'}
        before = sum(
            1
            for p in self.DEREP_IN_DIR.iterdir()
            if fasta_suffixes.intersection(p.suffixes)
        )

        # Representatives are counted from this subfolder of the output
        # directory. NOTE(review): the folder name is assumed to be what
        # skDER creates; a missing folder raises FileNotFoundError here.
        representatives_dir = self.DEREP_OUT_DIR / 'Dereplicated_Representative_Genomes'
        after = sum(1 for _ in representatives_dir.iterdir())

        LOG.info(f'{before} genomes were reduced to {after} genomes.')