# -*- coding: utf-8 -*-
"""Code for representing a VCF record
The VCF record structure is modeled after the one of PyVCF
"""
import re
#: Code for single nucleotide variant allele
SNV = 'SNV'
#: Code for a multi nucleotide variant allele
MNV = 'MNV'
#: Code for "clean" deletion allele
DEL = 'DEL'
#: Code for "clean" insertion allele
INS = 'INS'
#: Code for indel allele, includes substitutions of unequal length
INDEL = 'INDEL'
#: Code for structural variant allele
SV = 'SV'
#: Code for break-end allele
BND = 'BND'
#: Code for symbolic allele that is neither SV nor BND
SYMBOLIC = 'SYMBOLIC'
#: Code for mixed variant type
MIXED = 'MIXED'
#: Codes for structural variants
SV_CODES = ('DEL', 'INS', 'DUP', 'INV', 'CNV')
#: Characters reserved in VCF, have to be escaped
RESERVED_CHARS = ':;=%,\r\n\t'
#: Mapping for escaping reserved characters
ESCAPE_MAPPING = [
('%', '%25'), (':', '%3A'), (';', '%3B'), ('=', '%3D'), (',', '%2C'),
('\r', '%0D'), ('\n', '%0A'), ('\t', '%09')
]
#: Mapping from escaped characters to reserved one
UNESCAPE_MAPPING = [(v, k) for k, v in ESCAPE_MAPPING]
[docs]class Record:
"""Represent one record from the VCF file
Record objects are iterators of their calls
"""
def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
calls):
#: A ``str`` with the chromosome name
self.CHROM = CHROM
#: An ``int`` with a 1-based begin position
self.POS = POS
#: An ``int`` with a 0-based begin position
self.begin = POS - 1
#: An ``int`` with a 0-based end position
self.end = None # XXX
#: A list of the semicolon-separated values of the ID column
self.ID = list(ID)
#: A ``str`` with the REF value
self.REF = REF
#: A list of alternative allele records of type :py:class:`AltRecord`
self.ALT = list(ALT)
#: The quality value, can be ``None``
self.QUAL = QUAL
#: A list of strings for the FILTER column
self.FILTER = FILTER
#: An OrderedDict giving the values of the INFO column, flags are
#: mapped to ``True``
self.INFO = INFO
#: A list of strings for the FORMAT column
self.FORMAT = FORMAT
#: A list of genotype :py:class:`Call` objects
self.calls = list(calls)
for call in self.calls:
call.site = self
#: A mapping from sample name to entry in self.calls
self.call_for_sample = {call.sample: call for call in self.calls}
[docs] def is_snv(self):
"""Return ``True`` if it is a SNV"""
return (len(self.REF) == 1 and all(a.type == 'SNV' for a in self.ALT))
@property
def affected_start(self):
"""Return affected start position in 0-based coordinates
For SNVs, MNVs, and deletions, the behaviour is the start position.
In the case of insertions, the position behind the insert position is
returned, yielding a 0-length interval together with
:py:method:`affected_end`
"""
types = {alt.type for alt in self.ALT} # set!
BAD_MIX = {INS, SV, BND, SYMBOLIC} # don't mix well with others
if (BAD_MIX & types) and len(types) == 1 and list(types)[0] == INS:
# Only insertions, return 0-based position right of first base
return self.POS # right of first base
else: # Return 0-based start position of first REF base
return (self.POS - 1) # left of first base
@property
def affected_end(self):
"""Return affected start position in 0-based coordinates
For SNVs, MNVs, and deletions, the behaviour is based on the start
position and the length of the REF. In the case of insertions, the
position behind the insert position is returned, yielding a 0-length
interval together with :py:method:`affected_start`
"""
types = {alt.type for alt in self.ALT} # set!
BAD_MIX = {INS, SV, BND, SYMBOLIC} # don't mix well with others
if (BAD_MIX & types) and len(types) == 1 and list(types)[0] == INS:
# Only insertions, return 0-based position right of first base
return self.POS # right of first base
else: # Return 0-based end position, behind last REF base
return (self.POS - 1) + len(self.REF)
[docs] def add_filter(self, label):
"""Add label to FILTER if not set yet"""
if label not in self.FILTER:
self.FILTER.append(label)
def __iter__(self):
"""Return generator yielding from ``self.calls``"""
yield from self.calls
def __str__(self):
tpl = 'Record({})'
lst = [self.CHROM, self.POS, self.ID, self.REF, self.ALT, self.QUAL,
self.FILTER, self.INFO, self.FORMAT, self.calls]
return tpl.format(', '.join(map(repr, lst)))
def __repr__(self):
return str(self)
ALLELE_DELIM = re.compile(r'[|/]')
[docs]class Call:
"""The information for a genotype callable
By VCF, this should always include the genotype information and
can contain an arbitrary number of further annotation, e.g., the
coverage at the variant position.
"""
def __init__(self, sample, data, site=None):
#: the name of the sample for which the call was made
self.sample = sample
#: an OrderedDict with the key/value pair information from the
#: call's data
self.data = data
#: the :py:class:`Record` of this :py:class:`Call`
self.site = site
#: the allele numbers (0, 1, ...) in this calls or None for no-call
self.gt_alleles = None
#: whether or not the variant is fully called
self.called = None
#: the number of alleles in this sample's call
self.plodity = None
if self.data.get('GT', None) is not None:
self.gt_alleles = []
for allele in ALLELE_DELIM.split(self.data['GT']):
if allele == '.':
self.gt_alleles.append(allele)
else:
self.gt_alleles.append(int(allele))
self.called = all([al is not None for al in self.gt_alleles])
self.ploidty = len(self.gt_alleles)
@property
def phased(self):
"""Return boolean indicating whether this call is phased"""
return '|' in self.data.get('GT', '')
[docs] def gt_phase_char(self):
"""Return character to use for phasing"""
return '/' if not self.phased else '|'
@property
def gt_bases(self):
"""Return the actual genotype alleles, e.g. if VCF genotype is 0/1,
could return A/T"""
result = []
for a in self.gt_alleles:
if a is None:
result.append(None)
elif a == 0:
result.append(self.site.REF)
else:
result.append(self.site.ALT[a - 1])
return result
@property
def gt_type(self):
"""The type of genotype, mapping is
- hom_ref = 0
- het = 1
- hom_alt = 2 (which alt is untracked)
- uncalled = ``None``
"""
if not self.called:
return None # not called
elif all(a == 0 for a in self.gt_alleles):
return 0 # hom ref
elif len(set(self.gt_alleles)) == 0:
return 2 # hom alt
else:
return 1 # heterozygous
@property
def is_filtered(self):
"""Return ``True`` for filtered calls"""
raise NotImplementedError('Implement me!')
@property
def is_het(self):
"""Return ``True`` for filtered calls"""
raise NotImplementedError('Implement me!')
@property
def is_variant(self):
"""Return ``True`` for filtered calls"""
raise NotImplementedError('Implement me!')
@property
def is_phased(self):
"""Return ``True`` for phased calls"""
raise NotImplementedError('Implement me!')
def __str__(self):
tpl = 'Call({})'
lst = [self.sample, self.data]
return tpl.format(', '.join(map(repr, lst)))
def __repr__(self):
return str(self)
[docs]class AltRecord:
"""An alternative allele Record
Currently, can be a substitution, an SV placeholder, or breakend
"""
def __init__(self, type_=None):
#: String describing the type of the variant, could be one of
#: SNV, MNV, could be any of teh types described in the ALT
#: header lines, such as DUP, DEL, INS, ...
self.type = type_
[docs]class Substitution(AltRecord):
"""A basic alternative allele record describing a REF->AltRecord
substitution
Note that this subsumes MNVs, insertions, and deletions.
"""
def __init__(self, type_, value):
super().__init__(type_)
#: The alternative base sequence to use in the substitution
self.value = value
def __str__(self):
tpl = 'Substitution(type_={}, value={})'
return tpl.format(*map(repr, [self.type, self.value]))
def __repr__(self):
return str(self)
[docs]class SV(AltRecord):
"""A placeholder for an SV
For SVs, simply the ``type`` is written out as ``"<type>"`` as alternative
"""
def __init__(self, type_, value):
super().__init__(type_)
#: The alternative base sequence to use in the substitution
self.value = value
def __str__(self):
tpl = 'Substitution(type_={}, value={})'
return tpl.format(*map(repr, [self.type, self.value]))
def __repr__(self):
return str(self)
[docs]class BreakEnd(AltRecord):
"""A placeholder for a breakend"""
def __init__(self, type_, value):
super().__init__(type_)
#: The alternative base sequence to use in the substitution
self.value = value
def __str__(self):
tpl = 'Substitution(type_={}, value={})'
return tpl.format(*map(repr, [self.type, self.value]))
def __repr__(self):
return str(self)
[docs]class SingleBreakEnd(AltRecord):
"""A placeholder for a single breakend"""
def __init__(self, type_, value):
super().__init__(type_)
#: The alternative base sequence to use in the substitution
self.value = value
def __str__(self):
tpl = 'Substitution(type_={}, value={})'
return tpl.format(*map(repr, [self.type, self.value]))
def __repr__(self):
return str(self)
[docs]class SymbolicAllele(AltRecord):
"""A placeholder for a symbolic allele"""
def __init__(self, type_, value):
super().__init__(type_)
#: The alternative base sequence to use in the substitution
self.value = value
def __str__(self):
tpl = 'Substitution(type_={}, value={})'
return tpl.format(*map(repr, [self.type, self.value]))
def __repr__(self):
return str(self)