""" NCBI GeneInformation module """
import json
import sqlite3
import contextlib
from typing import Dict, List, Tuple, Optional
from Orange.data import Table, Domain, StringVariable
from Orange.data.util import get_unique_names_domain
from orangecontrib.bioinformatics.utils import serverfiles
from orangecontrib.bioinformatics.ncbi.taxonomy import species_name_to_taxid
from orangecontrib.bioinformatics.ncbi.gene.config import (
DOMAIN,
ENTREZ_ID,
query,
query_exact,
gene_info_attributes,
)
from orangecontrib.bioinformatics.widgets.utils.data import TableAnnotation
[docs]class Gene:
"""Representation of gene summary."""
__slots__ = gene_info_attributes + ('input_identifier',)
[docs] def __init__(self, input_identifier: Optional[str] = None):
"""
If we want to match gene to it's corresponding Entrez ID we must,
upon class initialization, provide some `input identifier`. This way
:class:`GeneMatcher` will know what to match it against in Gene Database.
Parameters
----------
input_identifier : str
This can be any of the following: symbol, synonym, locus tag, other database id, ...
"""
self.input_identifier = input_identifier
def __getattr__(self, attribute):
if attribute not in self.__slots__:
return None
def __repr__(self):
return (
f'<Gene symbol={self.symbol}, tax_id={self.tax_id}, gene_id={self.gene_id}>'
)
def load_attributes(
self,
values: Tuple[str, ...],
attributes: Tuple[str, ...] = gene_info_attributes,
):
for attr, val in zip(attributes, values):
setattr(
self,
attr,
json.loads(val) if attr in ('synonyms', 'db_refs', 'homologs') else val,
)
[docs] def homolog_gene(self, taxonomy_id: str) -> Optional[str]:
"""Returns gene homolog for given organism.
Parameters
----------
taxonomy_id: str
Taxonomy id of target organism.
Returns
-------
str
Entrez ID (if available).
"""
return self.homologs.get(taxonomy_id, None)
[docs]class GeneMatcher:
"""Gene name matching interface."""
[docs] def __init__(self, tax_id: str, progress_callback=None, auto_start=True):
"""
Parameters
----------
tax_id:: str
Taxonomy id of target organism.
"""
self._tax_id: str = tax_id
self._genes: List[Gene] = []
self._progress_callback = progress_callback
self._auto_start = auto_start
self.gene_db_path = self._gene_db_path()
@property
def tax_id(self):
return self._tax_id
@tax_id.setter
def tax_id(self, tax_id: str) -> None:
self._tax_id = tax_id
self.gene_db_path = self._gene_db_path()
@property
def genes(self) -> List[Gene]:
return self._genes
@genes.setter
def genes(self, genes: List[str]) -> None:
self._genes = [Gene(input_identifier=gene) for gene in genes]
if self._auto_start:
self._match()
[docs] def get_known_genes(self) -> List[Gene]:
"""Return Genes with known Entrez ID
Returns
-------
:class:`list` of :class:`Gene` instances
Genes with unique match
"""
return [gene for gene in self.genes if gene.gene_id]
[docs] def to_data_table(self, selected_genes: Optional[List[str]] = None) -> Table:
"""Transform GeneMatcher results to Orange data table.
Optionally we can provide a list of genes (Entrez Ids).
The table on the output will be populated only with provided genes.
Parameters
----------
selected_genes: list
List of Entrez Ids
Returns
-------
Orange.data.Table
Summary of Gene info in tabular format
"""
data_x = []
metas = [
StringVariable('Input gene ID'),
StringVariable(ENTREZ_ID),
StringVariable('Symbol'),
StringVariable('Synonyms'),
StringVariable('Description'),
StringVariable('Other IDs'),
StringVariable('Type of gene'),
StringVariable('Chromosome'),
StringVariable('Map location'),
StringVariable('Locus tag'),
StringVariable('Symbol from nomenclature authority'),
StringVariable('Full name from nomenclature authority'),
StringVariable('Nomenclature status'),
StringVariable('Other designations'),
StringVariable('Species'),
StringVariable('Taxonomy ID'),
]
domain = Domain([], metas=metas)
genes: List[Gene] = self.genes
if selected_genes is not None:
selected_genes_set = set(selected_genes)
genes = [
gene for gene in self.genes if str(gene.gene_id) in selected_genes_set
]
for gene in genes:
db_refs = (
', '.join(
'{}: {}'.format(key, val) for (key, val) in gene.db_refs.items()
)
if gene.db_refs
else ''
)
synonyms = ', '.join(gene.synonyms) if gene.synonyms else ''
line = [
gene.input_identifier,
gene.gene_id,
gene.symbol,
synonyms,
gene.description,
db_refs,
gene.type_of_gene,
gene.chromosome,
gene.map_location,
gene.locus_tag,
gene.symbol_from_nomenclature_authority,
gene.full_name_from_nomenclature_authority,
gene.nomenclature_status,
gene.other_designations,
species_name_to_taxid(gene.species),
gene.tax_id,
]
data_x.append(line)
table = Table.from_list(domain, data_x)
table.name = 'Gene Matcher Results'
table.attributes[TableAnnotation.tax_id] = self.tax_id
table.attributes[TableAnnotation.gene_as_attr_name] = False
table.attributes[TableAnnotation.gene_id_column] = ENTREZ_ID
return table
[docs] def match_table_column(
self,
data_table: Table,
column_name: str,
target_column: Optional[StringVariable] = None,
) -> Table:
"""Helper function for gene name matching with :class:`Orange.data.Table`.
Give a column of genes, GeneMatcher will try to map genes to their
corresponding Entrez Ids.
Parameters
----------
data_table: :class:`Orange.data.Table`
Data table
column_name: str
Name of the column where gene symbols are located
target_column: :class:`StringVariable`
Column where we store Entrez Ids.
Defaults to StringVariable(ncbi.gene.config.NCBI_ID)
Returns
-------
:class:`Orange.data.Table`
Data table with a column of Gene Ids
"""
if column_name in data_table.domain:
self.genes = data_table.get_column_view(column_name)[0]
if target_column is None:
target_column = StringVariable(ENTREZ_ID)
new_domain = Domain(
data_table.domain.attributes,
data_table.domain.class_vars,
data_table.domain.metas + (target_column,),
)
new_data = data_table.transform(new_domain)
with new_data.unlocked(new_data.metas):
new_data[:, target_column] = [
[str(gene.gene_id) if gene.gene_id else '?'] for gene in self.genes
]
return new_data
[docs] def match_table_attributes(
self, data_table, run=True, rename=False, source_name='Source ID'
) -> Table:
"""Helper function for gene name matching with :class:`Orange.data.Table`.
Match table attributes and if a unique match is found create a
new column attribute for Entrez Id. Attribute name is defined
here: `orangecontrib.bioinformatics.ncbi.gene.config.NCBI_ID`
Parameters
----------
data_table: :class:`Orange.data.Table`
Data table
Returns
-------
:class:`Orange.data.Table`
Data table column attributes are populated with Entrez Ids
"""
# run gene matcher
if run:
self.genes = [var.name for var in data_table.domain.attributes]
def helper(gene, attribute):
if gene.gene_id:
if rename:
attribute = attribute.renamed(gene.symbol)
attribute.attributes[source_name] = gene.input_identifier
attribute.attributes[ENTREZ_ID] = gene.gene_id
return attribute
attributes = [
helper(gene, attr)
for gene, attr in zip(self.genes, data_table.domain.attributes)
]
metas = data_table.domain.metas
(attr_deduplicated, _, metas_deduplicated), renamed = get_unique_names_domain(
[a.name for a in attributes], metas=[m.name for m in metas]
)
if len(renamed):
attributes = [
attr.renamed(new_name)
for attr, new_name in zip(attributes, attr_deduplicated)
]
metas = [
meta.renamed(new_name)
for meta, new_name in zip(metas, metas_deduplicated)
]
domain = Domain(attributes, data_table.domain.class_vars, metas)
return data_table.transform(domain)
def match_genes(self):
self._match()
def _gene_db_path(self):
return serverfiles.localpath_download(DOMAIN, f'{self.tax_id}.sqlite')
def _match(self):
synonyms, db_refs = 4, 5
with contextlib.closing(sqlite3.connect(self.gene_db_path)) as con:
with con as cursor:
for gene in self.genes:
if self._progress_callback:
self._progress_callback()
search_param = gene.input_identifier.lower()
if search_param:
match_statement = (
'{gene_id symbol locus_tag symbol_from_nomenclature_authority}:^"'
+ search_param
+ '"'
)
match = cursor.execute(
query_exact, (match_statement,) + tuple([search_param] * 4)
).fetchall()
# if unique match
if len(match) == 1:
gene.load_attributes(match[0])
continue
match = cursor.execute(
query, (f'synonyms:"{search_param}"',)
).fetchall()
synonym_matched_rows = [
m
for m in match
if search_param
in (x.lower() for x in json.loads(m[synonyms]))
]
# if unique match
if len(synonym_matched_rows) == 1:
gene.load_attributes(synonym_matched_rows[0])
continue
match = cursor.execute(
query, (f'db_refs:"{search_param}"',)
).fetchall()
db_ref_matched_rows = [
m
for m in match
if search_param
in (x.lower() for x in json.loads(m[db_refs]).values())
]
# if unique match
if len(db_ref_matched_rows) == 1:
gene.load_attributes(db_ref_matched_rows[0])
continue
[docs]class GeneInfo(dict):
[docs] def __init__(self, tax_id: str):
"""Loads genes for given organism in a dict.
Each instance of :class:`Gene` is mapped to corresponding Entrez ID
Parameters
----------
tax_id: str
Taxonomy id of target organism.
"""
super().__init__()
self.tax_id: str = tax_id
self.gene_db_path: str = self._gene_db_path()
connection = sqlite3.connect(self.gene_db_path)
cursor = connection.cursor()
for gene_info in cursor.execute('SELECT * FROM gene_info').fetchall():
gene = Gene()
gene.load_attributes(gene_info)
self[gene.gene_id] = gene
cursor.close()
connection.close()
def _gene_db_path(self):
return serverfiles.localpath_download(DOMAIN, f'{self.tax_id}.sqlite')
def load_gene_summary(tax_d: str, genes: List[Optional[str]]) -> List[Optional[Gene]]:
gene_db_path = serverfiles.localpath_download(DOMAIN, f'{tax_d}.sqlite')
# filter NoneTypes
_genes = [g for g in genes if g]
with contextlib.closing(sqlite3.connect(gene_db_path)) as con:
with con as cur:
gene_map: Dict[str, Gene] = {}
for gene_info in cur.execute(
f'SELECT * FROM gene_info WHERE gene_id in ({",".join(_genes)})'
).fetchall():
gene = Gene()
gene.load_attributes(gene_info)
gene_map[gene.gene_id] = gene
return [gene_map.get(gid, None) if gid else None for gid in genes]
if __name__ == "__main__":
gm = GeneMatcher('9606')
gm.genes = ['CD4', '614535', 'ENSG00000205426', "2'-PDE", 'HB-1Y']
print(list(zip(gm.genes, [g.input_identifier for g in gm.genes])))
_homologs = load_gene_summary(
'10090', [g.homolog_gene(taxonomy_id='10090') for g in gm.genes]
)
print(_homologs)