from katlas.data import *
from katlas.utils import *
import pandas as pd
from fastcore.all import L
import re
# Cap printed rows at 20 and allow up to 100 columns when displaying frames.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
Get all kinase domain sequences
Setup
Kinase domain info of current human kinome
# Load the kinase info table and count rows with a non-null `kinasecom_domain`.
df = Data.get_kinase_info()
df.kinasecom_domain.notna().sum()
np.int64(516)
# df.uniprot.to_csv('kinase_uniprot.csv')
Query the UniProt IDs with the UniProt ID-mapping tool, and add the Domain [FT] and Region columns to the output.
# Read the ID-mapping export and preview its Domain [FT] annotations.
uniprot = pd.read_excel('raw/idmapping_kinase_info_2025_05_27.xlsx')
uniprot['Domain [FT]'].head()
0 DOMAIN 46..315; /note="Protein kinase"; /evide...
1 DOMAIN 61..121; /note="SH3"; /evidence="ECO:00...
2 DOMAIN 107..167; /note="SH3"; /evidence="ECO:0...
3 DOMAIN 126..385; /note="Protein kinase"; /evid...
4 DOMAIN 192..485; /note="Protein kinase"; /evid...
Name: Domain [FT], dtype: object
FAM20C does not have Domain[FT]
but contains Region to indicate kinase domain
# Show the Region annotation of entries that have no Domain [FT] value.
uniprot[uniprot['Domain [FT]'].isna()].Region.tolist()
['REGION 62..81; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 94..159; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 354..565; /note="Kinase domain"; /evidence="ECO:0000305|PubMed:22582013"']
# Keep only entries that carry a Domain [FT] annotation.
uniprot = uniprot.dropna(subset='Domain [FT]')
def extract_kinase_domains1(text):
    """
    Extract kinase-related DOMAIN/REGION annotations from a UniProt feature string.

    An entry is kept when its note contains 'kinase' or 'PI3K/PI4K catalytic'
    (case-insensitive). Coordinates are matched but not returned.

    Args:
        text: Feature string such as
            'DOMAIN 46..315; /note="Protein kinase"; /evidence="..."'.
            Non-string values (e.g. NaN from an empty cell) yield an empty list.

    Returns:
        A list of [note, evidence] pairs, one per matching entry; evidence is
        the string 'nan' when the entry has no /evidence field.
    """
    # Missing cells arrive as float NaN / None; there is nothing to parse.
    if not isinstance(text, str):
        return []

    pattern = r'(?:REGION|DOMAIN) [<>]?(\d+)\.\.[<>]?(\d+); /note="([^"]*?(?:kinase|PI3K/PI4K catalytic)[^"]*?)"(?:; /evidence="([^"]*?)")?'
    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    # The optional evidence group is '' when absent, hence the 'nan' fallback.
    return [[note.strip(), evidence if evidence else 'nan']
            for _start, _end, note, evidence in matches]
# One list of [note, evidence] pairs per UniProt entry.
out = uniprot['Domain [FT]'].apply(extract_kinase_domains1)
The rest of the kinases have domain info:
out.explode().value_counts()
Domain [FT]
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159] 454
[AGC-kinase C-terminal, ECO:0000255|PROSITE-ProRule:PRU00618] 55
[Protein kinase 2, ECO:0000255|PROSITE-ProRule:PRU00159] 14
[Protein kinase 1, ECO:0000255|PROSITE-ProRule:PRU00159] 14
[Protein kinase, nan] 7
[Alpha-type protein kinase, ECO:0000255|PROSITE-ProRule:PRU00501] 6
[PI3K/PI4K catalytic, ECO:0000255|PROSITE-ProRule:PRU00269] 6
[Histidine kinase, ECO:0000255|PROSITE-ProRule:PRU00107] 4
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159, ECO:0000305] 4
[Guanylate kinase-like, ECO:0000255|PROSITE-ProRule:PRU00100] 1
[Protein kinase; inactive, ECO:0000255|PROSITE-ProRule:PRU00159] 1
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159, ECO:0000269|PubMed:15194684] 1
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159, ECO:0000269|PubMed:9092543] 1
[Protein kinase, ECO:0000250|UniProtKB:Q5VT25, ECO:0000255|PROSITE-ProRule:PRU00159] 1
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159, ECO:0000312|EMBL:CAA84485.1] 1
[Protein kinase, ECO:0000305] 1
Name: count, dtype: int64
We’ll search uniprot across species based on the info above to get all kinases and their kinase domains.
Download kinase domain info from Uniprot
Uniprot reviewed –> search advanced
–> domain: “Protein kinase”, or “PI3K PI4K catalytic” or “Histidine kinase”
or –> region: “Kinase domain”
# One UniProt export per query: three domain queries (kd1-kd3) and one region query (kd4).
kd1 = pd.read_excel('raw/uniprotkb_ft_domain_Protein_kinase_AND_2025_05_25.xlsx')
kd2 = pd.read_excel('raw/uniprotkb_ft_domain_PI3K_PI4K_catalytic_2025_05_25.xlsx')
kd3 = pd.read_excel('raw/uniprotkb_ft_domain_Histidine_kinase_AN_2025_05_25.xlsx')
kd4 = pd.read_excel('raw/uniprotkb_ft_region_Kinase_domain_2025_05_26.xlsx')

# Stack all query results, then keep rows whose Organism contains 'Homo'.
kd_all = pd.concat([kd1, kd2, kd3, kd4], ignore_index=True)
kd_human = kd_all[kd_all.Organism.str.contains('Homo')]
Check how it overlaps with the current kinome
kinase domain query vs. kinome tree kinases
kinase_info = Data.get_kinase_info()
# NOTE(review): get_diff is a project helper; a/b presumably hold the rows found
# only in the first / second frame when compared on the given ID columns - confirm.
a, b = get_diff(kd_human, kinase_info, 'Uniprot', 'uniprot')
b.iloc[:, :5]  # special for kinome tree
kinase | ID_coral | uniprot | gene | modi_group | |
---|---|---|---|---|---|
41 | BRD2 | BRD2 | P25440 | BRD2 | Atypical |
42 | BRD3 | BRD3 | Q15059 | BRD3 | Atypical |
43 | BRD4 | BRD4 | O60885 | BRD4 | Atypical |
44 | BRDT | BRDT | Q58F21 | BRDT | Atypical |
486 | TRIM24 | TIF1a | O15164 | TRIM24 | Atypical |
487 | TRIM28 | TIF1b | Q13263 | TRIM28 | Atypical |
488 | TRIM33 | TIF1g | Q9UPN9 | TRIM33 | Atypical |
These kinases do not have a clear kinase-domain boundary in the Domain [FT] column.
a.iloc[:, :5]  # special for kinase domain query
Uniprot | Reviewed | Entry Name | Protein names | Gene Names | |
---|---|---|---|---|---|
895 | P21675 | reviewed | TAF1_HUMAN | Transcription initiation factor TFIID subunit ... | TAF1 BA2R CCG1 CCGS TAF2A |
1982 | Q496M5 | reviewed | PLK5_HUMAN | Inactive serine/threonine-protein kinase PLK5 ... | PLK5 PLK5P FG060302 |
2448 | Q6A1A2 | reviewed | PDPK2_HUMAN | Putative 3-phosphoinositide-dependent protein ... | PDPK2P PDPK2 |
2894 | Q8NEV1 | reviewed | CSK23_HUMAN | Casein kinase II subunit alpha 3 (CK II alpha ... | CSNK2A3 CSNK2A1P |
3624 | Q9UQ88 | reviewed | CD11A_HUMAN | Cyclin-dependent kinase 11A (EC 2.7.11.22) (Ce... | CDK11A CDC2L2 CDC2L3 PITSLREB |
... | ... | ... | ... | ... | ... |
4761 | Q9UBF8 | reviewed | PI4KB_HUMAN | Phosphatidylinositol 4-kinase beta (PI4K-beta)... | PI4KB PIK4CB |
4790 | A4QPH2 | reviewed | PI4P2_HUMAN | Putative phosphatidylinositol 4-kinase alpha-l... | PI4KAP2 |
4812 | A4D2B8 | reviewed | PM2P1_HUMAN | Putative postmeiotic segregation increased 2-l... | PMS2P1 PMS2L1 PMS2L13 PMS2L6 PMS2L8 PMS3 PMS8 ... |
4878 | Q15119 | reviewed | PDK2_HUMAN | [Pyruvate dehydrogenase (acetyl-transferring)]... | PDK2 PDHK2 |
5441 | Q8N159 | reviewed | NAGS_HUMAN | N-acetylglutamate synthase, mitochondrial (EC ... | NAGS |
21 rows × 5 columns
The kinase keyword query contains many non-serine/threonine/tyrosine kinases.
kinase domain query vs. kinase keyword query
# Keyword-query export; rename 'Entry' so it shares the 'Uniprot' ID column with kd_human.
all_kinase = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx').rename(columns={'Entry': 'Uniprot'})
a, b = get_diff(kd_human, all_kinase, 'Uniprot')
a.iloc[:, :5]  # special for kd domain query
Uniprot | Reviewed | Entry Name | Protein names | Gene Names | |
---|---|---|---|---|---|
1982 | Q496M5 | reviewed | PLK5_HUMAN | Inactive serine/threonine-protein kinase PLK5 ... | PLK5 PLK5P FG060302 |
5441 | Q8N159 | reviewed | NAGS_HUMAN | N-acetylglutamate synthase, mitochondrial (EC ... | NAGS |
The kinase-domain query also includes an inactive kinase (PLK5), which is not marked with ‘kinase’ in the keywords.
Another one is NAGS, which contains Amino-acid kinase domain (AAK), and not marked with ‘kinase’ in the keywords.
b.iloc[:, :5]  # special for kinase keyword query
Uniprot | Entry Name | Protein names | Gene Names | uniprot_keyword_kinase | |
---|---|---|---|---|---|
0 | A2RU49 | HYKK_HUMAN | Hydroxylysine kinase (5-hydroxy-L-lysine kinas... | HYKK AGPHD1 | 1 |
3 | O00142 | KITM_HUMAN | Thymidine kinase 2, mitochondrial (EC 2.7.1.21... | TK2 | 1 |
11 | O00746 | NDKM_HUMAN | Nucleoside diphosphate kinase, mitochondrial (... | NME4 NM23D | 1 |
13 | O00764 | PDXK_HUMAN | Pyridoxal kinase (EC 2.7.1.35) (Pyridoxine kin... | PDXK C21orf124 C21orf97 PKH PNK PRED79 | 1 |
23 | O14986 | PI51B_HUMAN | Phosphatidylinositol 4-phosphate 5-kinase type... | PIP5K1B STM7 | 1 |
... | ... | ... | ... | ... | ... |
638 | O60885 | BRD4_HUMAN | Bromodomain-containing protein 4 (Protein HUNK1) | BRD4 HUNK1 | 0 |
639 | Q58F21 | BRDT_HUMAN | Bromodomain testis-specific protein (Cancer/te... | BRDT | 0 |
663 | O15164 | TIF1A_HUMAN | Transcription intermediary factor 1-alpha (TIF... | TRIM24 RNF82 TIF1 TIF1A | 0 |
664 | Q13263 | TIF1B_HUMAN | Transcription intermediary factor 1-beta (TIF1... | TRIM28 KAP1 RNF96 TIF1B | 0 |
665 | Q9UPN9 | TRI33_HUMAN | E3 ubiquitin-protein ligase TRIM33 (EC 2.3.2.2... | TRIM33 KIAA1113 RFG7 TIF1G | 0 |
151 rows × 5 columns
The kinase keyword query contains many non-serine/threonine/tyrosine kinases.
Extract kinase domain info
Get residue start, end, and evidence info.
def extract_kinase_domains(text):
    """
    Extract kinase-related DOMAIN/REGION annotations, with coordinates, from a
    UniProt feature string.

    An entry is kept when its note contains 'kinase' or 'PI3K/PI4K catalytic'
    (case-insensitive). A leading '<' or '>' on a coordinate is accepted and
    dropped.

    Args:
        text: Feature string such as
            'DOMAIN 46..315; /note="Protein kinase"; /evidence="..."'.
            Non-string values (e.g. NaN from an empty cell) yield an empty list.

    Returns:
        A list of [note, start, end, evidence] entries, one per match; start
        and end are ints, and evidence is the string 'nan' when the entry has
        no /evidence field.
    """
    # Missing cells arrive as float NaN / None; there is nothing to parse.
    if not isinstance(text, str):
        return []

    pattern = r'(?:REGION|DOMAIN) [<>]?(\d+)\.\.[<>]?(\d+); /note="([^"]*?(?:kinase|PI3K/PI4K catalytic)[^"]*?)"(?:; /evidence="([^"]*?)")?'
    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    # The optional evidence group is '' when absent, hence the 'nan' fallback.
    return [[note.strip(), int(start), int(end), evidence if evidence else 'nan']
            for start, end, note, evidence in matches]
Region column
Specific for kd4, without domain info but with region info
# kd4 comes from the region query, so parse its Region column.
kd4['kd_info'] = kd4['Region'].apply(extract_kinase_domains)
kd4['kd_info']
0 [[Amino-acid kinase domain (AAK), 40, 361, ECO...
1 [[Kinase domain, 349, 560, ECO:0000250|UniProt...
2 [[Kinase domain, 354, 565, ECO:0000305|PubMed:...
3 [[Amino-acid kinase domain (AAK), 19, 376, nan]]
4 [[Amino-acid kinase domain (AAK), 19, 369, ECO...
5 [[HWE histidine kinase domain, 160, 236, nan]]
6 [[HWE histidine kinase domain, 285, 367, nan]]
7 [[HWE histidine kinase domain, 260, 303, nan]]
8 [[HWE histidine kinase domain, 259, 341, nan]]
9 [[HWE histidine kinase domain, 259, 341, nan]]
10 [[HWE histidine kinase domain, 286, 368, nan]]
11 [[HWE histidine kinase domain, 285, 367, nan]]
12 [[HWE histidine kinase domain, 285, 367, nan]]
13 [[HWE histidine kinase domain, 285, 367, nan]]
14 [[HWE histidine kinase domain, 254, 336, nan]]
15 [[HWE histidine kinase domain, 259, 341, nan]]
Name: kd_info, dtype: object
Domain
Most kinases contain domain info for kinase domain.
# Merge the three domain-query exports and keep one row per UniProt accession.
kd = pd.concat([kd1, kd2, kd3])
kd = kd.drop_duplicates('Uniprot').reset_index(drop=True)
kd['kd_info'] = kd['Domain [FT]'].apply(extract_kinase_domains)
kd['kd_info'].str.len().value_counts()  # check non-zero
kd_info
1 5015
2 397
3 22
Name: count, dtype: int64
Some kinases have 3 matching domain annotations
# Inspect the raw annotation of the first entry with three extracted domains.
kd[kd['kd_info'].str.len() == 3]['Domain [FT]'].tolist()[0]
'DOMAIN 49..318; /note="Protein kinase 1"; /evidence="ECO:0000255|PROSITE-ProRule:PRU00159"; DOMAIN 319..387; /note="AGC-kinase C-terminal"; /evidence="ECO:0000255|PROSITE-ProRule:PRU00618"; DOMAIN 426..687; /note="Protein kinase 2"; /evidence="ECO:0000255|PROSITE-ProRule:PRU00159"'
On inspection, the match also includes the AGC-kinase C-terminal domain. We will remove these entries later.
kd_final = pd.concat([kd, kd4])
# One row per extracted domain.
df = kd_final.explode('kd_info', ignore_index=True)
# Spread each [note, start, end, evidence] entry into its own columns.
df[['domain_note', 'domain_start', 'domain_end', 'domain_evidence']] = df.kd_info.apply(pd.Series)
df = df.drop(columns=['kd_info'])
Remove non kinase domains:
df.columns
Index(['Uniprot', 'Reviewed', 'Entry Name', 'Protein names', 'Gene Names',
'Gene Names (primary)', 'Organism', 'Length', 'Domain [FT]',
'Domain [CC]', 'Motif', 'Protein families', 'Reactome', 'ComplexPortal',
'Subcellular location [CC]', 'Gene Ontology (biological process)',
'Tissue specificity', 'Interacts with', 'Subunit structure',
'Function [CC]', 'Activity regulation', 'Mass', 'Sequence', 'Region',
'domain_note', 'domain_start', 'domain_end', 'domain_evidence'],
dtype='object')
# Drop the AGC-kinase C-terminal annotations picked up by the 'kinase' match.
df = df[df.domain_note != 'AGC-kinase C-terminal'].reset_index(drop=True)
df.domain_note.value_counts()
domain_note
Protein kinase 4530
Histidine kinase 625
PI3K/PI4K catalytic 168
Protein kinase 2 73
Protein kinase 1 73
Alpha-type protein kinase 22
Protein kinase; inactive 13
HWE histidine kinase domain 11
Guanylate kinase-like 5
Histidine kinase 2 4
Histidine kinase 1 4
Amino-acid kinase domain (AAK) 3
Kinase domain 2
Protein kinase; truncated 1
Histidine kinase; first part 1
Histidine kinase; second part 1
Name: count, dtype: int64
Download uniprot sequence
The large TITIN proteins (A2ASS6 for mouse and Q8WZ42 for human) are truncated when downloaded online.
from tqdm.contrib.concurrent import thread_map
from kdock.core.protein import get_uniprot_seq
df = df.rename(columns={'Sequence': 'full_protein_seq'})
# Fetch one sequence per accession using 5 worker threads.
full_seq = thread_map(get_uniprot_seq, df.Uniprot, max_workers=5)
CPU times: user 8.26 s, sys: 845 ms, total: 9.11 s
Wall time: 6min 2s
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) File ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/tqdm/contrib/concurrent.py:51, in _executor_map(PoolExecutor, fn, *iterables, **tqdm_kwargs) 49 with PoolExecutor(max_workers=max_workers, initializer=tqdm_class.set_lock, 50 initargs=(lk,)) as ex: ---> 51 return list(tqdm_class(ex.map(fn, *iterables, chunksize=chunksize), **kwargs)) File ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/tqdm/notebook.py:250, in tqdm_notebook.__iter__(self) 249 it = super().__iter__() --> 250 for obj in it: 251 # return super(tqdm...) will not catch exception 252 yield obj File ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/tqdm/std.py:1181, in tqdm.__iter__(self) 1180 try: -> 1181 for obj in iterable: 1182 yield obj File /usr/lib/python3.12/concurrent/futures/_base.py:619, in Executor.map.<locals>.result_iterator() 618 if timeout is None: --> 619 yield _result_or_cancel(fs.pop()) 620 else: File /usr/lib/python3.12/concurrent/futures/_base.py:317, in _result_or_cancel(***failed resolving arguments***) 316 try: --> 317 return fut.result(timeout) 318 finally: File /usr/lib/python3.12/concurrent/futures/_base.py:451, in Future.result(self, timeout) 449 return self.__get_result() --> 451 self._condition.wait(timeout) 453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: File /usr/lib/python3.12/threading.py:355, in Condition.wait(self, timeout) 354 if timeout is None: --> 355 waiter.acquire() 356 gotit = True KeyboardInterrupt: During handling of the above exception, another exception occurred: KeyboardInterrupt Traceback (most recent call last) Cell In[47], line 1 ----> 1 get_ipython().run_cell_magic('time', '', 'full_seq = thread_map(get_uniprot_seq,df.Uniprot, max_workers=5)\n') File ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py:2565, in InteractiveShell.run_cell_magic(self, magic_name, 
line, cell) 2563 with self.builtin_trap: 2564 args = (magic_arg_s, cell) -> 2565 result = fn(*args, **kwargs) 2567 # The code below prevents the output from being displayed 2568 # when using magics with decorator @output_can_be_silenced 2569 # when the last Python token in the expression is a ';'. 2570 if getattr(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, False): File ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/IPython/core/magics/execution.py:1470, in ExecutionMagics.time(self, line, cell, local_ns) 1468 if interrupt_occured: 1469 if exit_on_interrupt and captured_exception: -> 1470 raise captured_exception 1471 return 1472 return out File ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/IPython/core/magics/execution.py:1434, in ExecutionMagics.time(self, line, cell, local_ns) 1432 st = clock2() 1433 try: -> 1434 exec(code, glob, local_ns) 1435 out = None 1436 # multi-line %%time case File <timed exec>:1 File ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/tqdm/contrib/concurrent.py:69, in thread_map(fn, *iterables, **tqdm_kwargs) 55 """ 56 Equivalent of `list(map(fn, *iterables))` 57 driven by `concurrent.futures.ThreadPoolExecutor`. (...) 66 [default: max(32, cpu_count() + 4)]. 
67 """ 68 from concurrent.futures import ThreadPoolExecutor ---> 69 return _executor_map(ThreadPoolExecutor, fn, *iterables, **tqdm_kwargs) File ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/tqdm/contrib/concurrent.py:49, in _executor_map(PoolExecutor, fn, *iterables, **tqdm_kwargs) 46 lock_name = kwargs.pop("lock_name", "") 47 with ensure_lock(tqdm_class, lock_name=lock_name) as lk: 48 # share lock in case workers are already using `tqdm` ---> 49 with PoolExecutor(max_workers=max_workers, initializer=tqdm_class.set_lock, 50 initargs=(lk,)) as ex: 51 return list(tqdm_class(ex.map(fn, *iterables, chunksize=chunksize), **kwargs)) File /usr/lib/python3.12/concurrent/futures/_base.py:647, in Executor.__exit__(self, exc_type, exc_val, exc_tb) 646 def __exit__(self, exc_type, exc_val, exc_tb): --> 647 self.shutdown(wait=True) 648 return False File /usr/lib/python3.12/concurrent/futures/thread.py:238, in ThreadPoolExecutor.shutdown(self, wait, cancel_futures) 236 if wait: 237 for t in self._threads: --> 238 t.join() File /usr/lib/python3.12/threading.py:1147, in Thread.join(self, timeout) 1144 raise RuntimeError("cannot join current thread") 1146 if timeout is None: -> 1147 self._wait_for_tstate_lock() 1148 else: 1149 # the behavior of a negative timeout isn't documented, but 1150 # historically .join(timeout=x) for x<0 has acted as if timeout=0 1151 self._wait_for_tstate_lock(timeout=max(timeout, 0)) File /usr/lib/python3.12/threading.py:1167, in Thread._wait_for_tstate_lock(self, block, timeout) 1164 return 1166 try: -> 1167 if lock.acquire(block, timeout): 1168 lock.release() 1169 self._stop() KeyboardInterrupt:
df['full_seq'] = full_seq
# Rows where the fetched sequence differs from the one in the spreadsheet export.
df[df['full_seq'] != df['full_protein_seq']]
It seems the UniProt online download limits the length of the sequences it returns.
# Use the sequence from the spreadsheet export as the full-length sequence.
df['full_seq'] = df['full_protein_seq']
Get domain sequence
def get_region(seq, start, end):
    """Return the slice of `seq` from 1-based position `start` through `end`, inclusive."""
    # Convert the 1-based start to Python's 0-based offset; `end` stays as the
    # exclusive slice bound, which makes the 1-based end inclusive.
    offset = start - 1
    return seq[offset:end]
# Cut each domain out of its full-length sequence using the extracted coordinates.
df['domain_seq'] = df.apply(lambda r: get_region(r['full_seq'], r['domain_start'], r['domain_end']), axis=1)
df['domain_seq']
0 AGFHEILGAGASGVVYKGQLEDELKTNIAVKTIHKLQPETEKEFMV...
1 YMLGDEIGKGAYGRVYIGLDLENGDFVAIKQVSLENIVQEDLNTIM...
2 YKVIAELGHGLWSRDVLAEHRFIKKRYILKILPSELSSSENFMRVF...
3 TLSDCQIGHGATATVFKMDIKIKKELQNGRMGEAVGDQMKAALKRF...
4 FDSGNVIGVGGSGATVYEGVLPSGSRVAVKRFQAIGSCTKAFDSEL...
...
5531 EIAHRFKNSMAMVQSIANQTLRNTYDPEQANRLFSERLRALSQAHD...
5532 EIAHRFKNSMAMVQSIANQTLRNTYDPEQANRLFSERLRALSQAHD...
5533 EIAHRFKNSMAMVQSIVNQTLRNTYDPEQANRLFSERLRALSQAHD...
5534 ALTGENPLVLGIVQDVTERKKAEANKALVSREIAHRFKNSMAMVQS...
5535 NPLVLGIVQDVTERKKAEANKALVSREIAHRFKNSMAMVQSIANQT...
Name: domain_seq, Length: 5536, dtype: object
# Count missing domain sequences.
df['domain_seq'].isna().sum()
np.int64(0)
Arrange columns & save
# Sort so that domains of the same protein appear in order of start position.
df = df.sort_values(['Uniprot', 'domain_start']).reset_index(drop=True)

# Build a unique ID per domain: <accession>_<entry name>_KD<n>.
# cumcount() numbers the rows within each accession 0, 1, 2, ...; the previous
# duplicated()-based flag labelled every domain after the first as "KD2", which
# would give colliding IDs for a protein with three or more domains.
df['KD_ID'] = (df['Uniprot'] + '_' + df['Entry Name'] + '_KD'
               + (df.groupby('Uniprot').cumcount() + 1).astype(str))

# Column selection and order for the output table.
col = ['KD_ID', 'Uniprot', 'Entry Name', 'Protein names', 'Gene Names', 'Gene Names (primary)',
       'Organism', 'domain_note', 'domain_evidence', 'domain_start', 'domain_end', 'domain_seq', 'Domain [FT]',
       'Domain [CC]', 'Region', 'Motif', 'Protein families', 'Reactome', 'ComplexPortal',
       'Subcellular location [CC]', 'Gene Ontology (biological process)',
       'Tissue specificity', 'Interacts with', 'Subunit structure',
       'Function [CC]', 'Activity regulation', 'full_seq',
       ]
df = df[col]
# df[df.KD_ID.str.contains('JAK1')] # check JAK1 to see if KD1 and KD2 is assigned correctly

# Positional rename: same order as `col`, with the KD_ID/domain_* names switched to kd_*.
new_colname = ['kd_ID', 'Uniprot', 'Entry Name', 'Protein names', 'Gene Names', 'Gene Names (primary)',
               'Organism', 'kd_note', 'kd_evidence', 'kd_start', 'kd_end', 'kd_seq', 'Domain [FT]',
               'Domain [CC]', 'Region', 'Motif', 'Protein families', 'Reactome', 'ComplexPortal',
               'Subcellular location [CC]', 'Gene Ontology (biological process)',
               'Tissue specificity', 'Interacts with', 'Subunit structure',
               'Function [CC]', 'Activity regulation', 'full_seq',
               ]
df.columns = new_colname
# df.to_excel('out/uniprot_kd.xlsx',index=False)
df.shape
(5536, 27)
Kinases with 1-2 kinase domains
df.columns
Index(['kd_ID', 'Uniprot', 'Entry Name', 'Protein names', 'Gene Names',
'Gene Names (primary)', 'Organism', 'kd_note', 'kd_evidence',
'kd_start', 'kd_end', 'kd_seq', 'Domain [FT]', 'Domain [CC]', 'Region',
'Motif', 'Protein families', 'Reactome', 'ComplexPortal',
'Subcellular location [CC]', 'Gene Ontology (biological process)',
'Tissue specificity', 'Interacts with', 'Subunit structure',
'Function [CC]', 'Activity regulation', 'full_seq'],
dtype='object')
# Collect the domain notes of each protein into one list per accession.
out = df.groupby('Uniprot').agg({'kd_note': list})
out.kd_note.value_counts()
kd_note
[Protein kinase] 4521
[Histidine kinase] 621
[PI3K/PI4K catalytic] 168
[Protein kinase 1, Protein kinase 2] 73
[Alpha-type protein kinase] 22
[Protein kinase; inactive] 13
[HWE histidine kinase domain] 11
[Protein kinase, Guanylate kinase-like] 5
[Histidine kinase 1, Histidine kinase 2] 4
[Protein kinase, Histidine kinase] 4
[Amino-acid kinase domain (AAK)] 3
[Kinase domain] 2
[Histidine kinase; first part] 1
[Histidine kinase; second part] 1
[Protein kinase; truncated] 1
Name: count, dtype: int64
Some kinases contain two kinase domains. Some are both protein kinase domains, while some have the first to be protein kinase and the second to be histidine kinase.
Duplicates among kinase domains
Across species
def get_dup(df):
    """
    List kinase-domain sequences that occur more than once, with their IDs.

    Args:
        df: DataFrame with string columns `kd_seq` and `kd_ID`.

    Returns:
        DataFrame with one row per repeated `kd_seq` and a `kd_ID` column
        holding the comma-joined IDs of all rows sharing that sequence.
    """
    # keep=False flags every member of a duplicate group, not only the repeats.
    # A stable sort keeps the input row order within each sequence, so the
    # joined ID order is deterministic.
    dup = df[df.kd_seq.duplicated(keep=False)].sort_values('kd_seq', kind='stable')
    return dup.groupby('kd_seq').agg({'kd_ID': lambda ids: ','.join(ids)}).reset_index()
dup_unique = get_dup(df)
# Duplicate groups that include at least one ID containing "HUMAN".
dup_unique[dup_unique.kd_ID.str.contains("HUMAN")]
kd_seq | kd_ID | |
---|---|---|
26 | EPWQEKVRRIREGSPYGHLPNWRLLSVIVKCGDDLRQELLAFQVLK... | B4UT09_PI4KB_OTOGA_KD1,A9X1A0_PI4KB_PAPAN_KD1,... |
30 | EYEGCKVGRGTYGHVYKARRKDGKDEKEYALKQIEGTGISMSACRE... | Q8BWD8_CDK19_MOUSE_KD1,Q9BWU1_CDK19_HUMAN_KD1 |
31 | FDFLKVIGKGSFGKVLLAKRKLDGKFYAVKVLQKKIVLNRKEQKHI... | Q96BR1_SGK3_HUMAN_KD1,Q5R7A7_SGK3_PONAB_KD1 |
33 | FDIIGIIGEGTYGQVYKARDKDTGEMVALKKVRLDNEKEGFPITAI... | E1BB52_CDK13_BOVIN_KD1,Q14004_CDK13_HUMAN_KD1,... |
34 | FDLIRVIGRGSYAKVLLVRLKKNDQIYAMKVVKKELVHDDEDIDWV... | P09217_KPCZ_RAT_KD1,Q05513_KPCZ_HUMAN_KD1,Q029... |
... | ... | ... |
271 | YTLKDEIGKGSYGVVKLAYNENDNTYYAMKVLSKKKLIRQAGFPRR... | Q96RR4_KKCC2_HUMAN_KD1,O88831_KKCC2_RAT_KD1 |
274 | YTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLR... | P46196_MK01_BOVIN_KD1,P28482_MK01_HUMAN_KD1 |
276 | YTRFEKIGQGASGTVYTALDIATGQEVAIKQMNLQQQPKKELIINE... | Q7YQL4_PAK3_PANTR_KD1,Q7YQL3_PAK3_PONPY_KD1,O7... |
277 | YTRFEKIGQGASGTVYTAMDVATGQEVAIKQMNLQQQPKKELIINE... | Q13153_PAK1_HUMAN_KD1,P35465_PAK1_RAT_KD1,Q08E... |
278 | YTRYEKIGQGASGTVFTATDVALGQEVAIKQINLQKQPKKELIINE... | Q13177_PAK2_HUMAN_KD1,Q8CIN4_PAK2_MOUSE_KD1 |
90 rows × 2 columns
Duplicates within human
human = df[df.Organism == 'Homo sapiens (Human)']

dup_human = get_dup(human)
dup_human
kd_seq | kd_ID |
---|
Check duplicates within other species
# The five most frequent organisms in the table.
species = df.Organism.value_counts().head().index
for s in species:
    df_species = df[df.Organism == s]
    print(s)
    print(get_dup(df_species))
Arabidopsis thaliana (Mouse-ear cress)
Empty DataFrame
Columns: [kd_seq, kd_ID]
Index: []
Mus musculus (Mouse)
kd_seq \
0 YVMLETIGHGGCATVKLAQHRLTGTHVAVKTIRKREYWCNRVISEV...
kd_ID
0 C0HKC8_SMK3A_MOUSE_KD1,C0HKC9_SMK3B_MOUSE_KD1
Homo sapiens (Human)
Empty DataFrame
Columns: [kd_seq, kd_ID]
Index: []
Dictyostelium discoideum (Social amoeba)
Empty DataFrame
Columns: [kd_seq, kd_ID]
Index: []
Rattus norvegicus (Rat)
Empty DataFrame
Columns: [kd_seq, kd_ID]
Index: []
only mouse SMK3A/3B duplicates in the top 5 species