#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from cagecleaner.run import Run
from cagecleaner.utils import run_command
import logging
from pathlib import Path
LOG = logging.getLogger(__name__)
[docs]
class RegionRun(Run):
    """
    Abstract intermediary class grouping the methods shared by every run involving region-based dereplication.

    Inherits from:
        Run: Base class providing argument parsing, hit recovery, session filtering and output generation functionalities

    See Also:
        LocalRegionRun: Region-based dereplication for hits in local sequences.
        RemoteRegionRun: Region-based dereplication for hits in remote sequences.
    """

    def __init__(self, parsed_args) -> None:
        """
        Initialise a RegionRun instance.

        Runs the base class init and creates the temporary folder in which the
        genomic regions are stored for region-based dereplication
        (TEMP_DIR/regions). No argument validation is performed here; that is
        left to the base class initialiser.

        Args:
            parsed_args (dict): Parsed and validated command-line arguments

        Raises:
            FileExistsError: If TEMP_DIR/regions already exists (the folder is
                created without ``exist_ok``).

        Returns:
            None
        """
        super().__init__(parsed_args)

        # Path where the genomic regions will be saved temporarily for region-based dereplication
        self.DEREP_IN_DIR: Path = self.TEMP_DIR / 'regions'
        # Deliberately no exist_ok: a pre-existing regions folder raises instead of being reused.
        self.DEREP_IN_DIR.mkdir(parents = True)

        return None

    def dereplicate_regions(self) -> None:
        """
        Dereplicate the genomic regions stored in DEREP_IN_DIR using MMseqs2.

        Every entry of DEREP_IN_DIR is passed as an input to ``mmseqs easy-cluster``.
        The identity and coverage settings are converted from percentages to
        fractions before being handed to MMseqs2. Results are written to
        DEREP_OUT_DIR with the output prefix ``derep``; MMseqs2 scratch files go
        to DEREP_OUT_DIR/tmp.

        Returns:
            None

        Raises:
            RuntimeError: If the MMseqs2 command run fails, or if the input folder is empty or does not exist.
        """
        # The verbosity passed on to MMseqs2 is capped at 3.
        mmseqs_verbosity = str(min(self.verbosity, 3))

        self.DEREP_OUT_DIR.mkdir(parents = True, exist_ok = True)

        # is_dir() also rejects a path that exists but is not a folder, which
        # would otherwise surface as an undocumented NotADirectoryError below.
        if not self.DEREP_IN_DIR.is_dir():
            msg = "The dereplication input folder does not exist!"
            LOG.critical(msg)
            raise RuntimeError(msg)

        # List the folder once. iterdir() yields entries in arbitrary order, so
        # sort them to keep the MMseqs2 command line reproducible between runs.
        region_files = sorted(self.DEREP_IN_DIR.iterdir())
        if not region_files:
            msg = "The dereplication input folder is empty!"
            LOG.critical(msg)
            raise RuntimeError(msg)

        cmd = ['mmseqs', 'easy-cluster',
               *[str(p) for p in region_files],
               str(self.DEREP_OUT_DIR / 'derep'),
               str(self.DEREP_OUT_DIR / 'tmp'),
               '--min-seq-id', str(self.identity/100),
               '-c', str(self.coverage/100),
               '--threads', str(self.cores),
               '-v', mmseqs_verbosity
               ]

        try:
            run_command(cmd)
        except RuntimeError as err:
            msg = 'Dereplicating regions with MMseqs2 failed!'
            LOG.critical(msg)
            # Chain explicitly so the underlying command failure stays visible in the traceback.
            raise RuntimeError(msg) from err

        return None