from katlas.data import *
from katlas.utils import *
import pandas as pd
from fastcore.all import L
import re
# Limit DataFrame display to 20 rows and up to 100 columns.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
Get all kinase domain sequences
Setup
Kinase domain info of current human kinome
# Load the current human kinome table and count entries that have a kinase.com domain.
df = Data.get_kinase_info()
df.kinasecom_domain.notna().sum()
516
# df.uniprot.to_csv('kinase_uniprot.csv')
Query the UniProt IDs with the ID mapping tool on UniProt, and add the Domain [FT] and Region columns to the results.
# Read the UniProt ID-mapping export and preview its domain annotations.
uniprot = pd.read_excel('raw/idmapping_kinase_info_2025_05_27.xlsx')
uniprot['Domain [FT]'].head()
0 DOMAIN 46..315; /note="Protein kinase"; /evide...
1 DOMAIN 61..121; /note="SH3"; /evidence="ECO:00...
2 DOMAIN 107..167; /note="SH3"; /evidence="ECO:0...
3 DOMAIN 126..385; /note="Protein kinase"; /evid...
4 DOMAIN 192..485; /note="Protein kinase"; /evid...
Name: Domain [FT], dtype: object
FAM20C does not have a Domain [FT] annotation,
but its Region column indicates the kinase domain:
# Show the Region annotation of entries that lack a Domain [FT] annotation.
uniprot[uniprot['Domain [FT]'].isna()].Region.tolist()
['REGION 62..81; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 94..159; /note="Disordered"; /evidence="ECO:0000256|SAM:MobiDB-lite"; REGION 354..565; /note="Kinase domain"; /evidence="ECO:0000305|PubMed:22582013"']
# Keep only entries that have a Domain [FT] annotation.
uniprot = uniprot.dropna(subset='Domain [FT]')
def extract_kinase_domains1(text):
    """
    Extract kinase-related DOMAIN/REGION annotations from a UniProt feature string.

    Only features whose note contains 'kinase' or 'PI3K/PI4K catalytic'
    (case-insensitive) are kept.

    Returns a list of [note, evidence] pairs, one per matching feature;
    evidence is the string 'nan' when the feature has no /evidence field.
    Non-string input (e.g. a missing NaN cell) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    pattern = r'(?:REGION|DOMAIN) [<>]?(\d+)\.\.[<>]?(\d+); /note="([^"]*?(?:kinase|PI3K/PI4K catalytic)[^"]*?)"(?:; /evidence="([^"]*?)")?'
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    # start/end are captured by the pattern but intentionally not reported here.
    return [[note.strip(), evidence if evidence else 'nan']
            for start, end, note, evidence in matches]
# Parse the kinase-related domain notes and evidence for every human kinase.
out = uniprot['Domain [FT]'].apply(extract_kinase_domains1)
The rest of the kinases have domain info:
out.explode().value_counts()
Domain [FT]
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159] 454
[AGC-kinase C-terminal, ECO:0000255|PROSITE-ProRule:PRU00618] 55
[Protein kinase 2, ECO:0000255|PROSITE-ProRule:PRU00159] 14
[Protein kinase 1, ECO:0000255|PROSITE-ProRule:PRU00159] 14
[Protein kinase, nan] 7
[Alpha-type protein kinase, ECO:0000255|PROSITE-ProRule:PRU00501] 6
[PI3K/PI4K catalytic, ECO:0000255|PROSITE-ProRule:PRU00269] 6
[Histidine kinase, ECO:0000255|PROSITE-ProRule:PRU00107] 4
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159, ECO:0000305] 4
[Guanylate kinase-like, ECO:0000255|PROSITE-ProRule:PRU00100] 1
[Protein kinase; inactive, ECO:0000255|PROSITE-ProRule:PRU00159] 1
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159, ECO:0000269|PubMed:15194684] 1
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159, ECO:0000269|PubMed:9092543] 1
[Protein kinase, ECO:0000250|UniProtKB:Q5VT25, ECO:0000255|PROSITE-ProRule:PRU00159] 1
[Protein kinase, ECO:0000255|PROSITE-ProRule:PRU00159, ECO:0000312|EMBL:CAA84485.1] 1
[Protein kinase, ECO:0000305] 1
Name: count, dtype: int64
We’ll search uniprot across species based on the info above to get all kinases and their kinase domains.
Download kinase domain info from Uniprot
Uniprot reviewed –> search advanced
–> domain: “Protein kinase”, or “PI3K PI4K catalytic” or “Histidine kinase”
or –> region: “Kinase domain”
# One UniProt export per search term: three domain queries and one region query.
kd1 = pd.read_excel('raw/uniprotkb_ft_domain_Protein_kinase_AND_2025_05_25.xlsx')
kd2 = pd.read_excel('raw/uniprotkb_ft_domain_PI3K_PI4K_catalytic_2025_05_25.xlsx')
kd3 = pd.read_excel('raw/uniprotkb_ft_domain_Histidine_kinase_AN_2025_05_25.xlsx')
kd4 = pd.read_excel('raw/uniprotkb_ft_region_Kinase_domain_2025_05_26.xlsx')
# Stack all query results, then keep the rows whose organism name contains 'Homo'.
kd_all = pd.concat([kd1,kd2,kd3,kd4],ignore_index=True)
kd_human = kd_all[kd_all.Organism.str.contains('Homo')]
Check how it overlaps with the current kinome
kinase domain query vs. kinome tree kinases
# Compare the human kinase-domain query against the kinome table by UniProt ID.
kinase_info = Data.get_kinase_info()
a,b = get_diff(kd_human,kinase_info,'Uniprot','uniprot')
b.iloc[:,:5] # special for kinome tree
kinase | ID_coral | uniprot | ID_HGNC | group | |
---|---|---|---|---|---|
44 | BRD2 | BRD2 | P25440 | BRD2 | Atypical |
45 | BRD3 | BRD3 | Q15059 | BRD3 | Atypical |
46 | BRD4 | BRD4 | O60885 | BRD4 | Atypical |
47 | BRDT | BRDT | Q58F21 | BRDT | Atypical |
472 | TRIM24 | TIF1a | O15164 | TRIM24 | Atypical |
473 | TRIM28 | TIF1b | Q13263 | TRIM28 | Atypical |
474 | TRIM33 | TIF1g | Q9UPN9 | TRIM33 | Atypical |
These kinases do not have a clear kinase-domain boundary in the Domain [FT] column.
a.iloc[:,:5] # special for kinase domain query
Uniprot | Reviewed | Entry Name | Protein names | Gene Names | |
---|---|---|---|---|---|
895 | P21675 | reviewed | TAF1_HUMAN | Transcription initiation factor TFIID subunit ... | TAF1 BA2R CCG1 CCGS TAF2A |
1982 | Q496M5 | reviewed | PLK5_HUMAN | Inactive serine/threonine-protein kinase PLK5 ... | PLK5 PLK5P FG060302 |
2448 | Q6A1A2 | reviewed | PDPK2_HUMAN | Putative 3-phosphoinositide-dependent protein ... | PDPK2P PDPK2 |
2894 | Q8NEV1 | reviewed | CSK23_HUMAN | Casein kinase II subunit alpha 3 (CK II alpha ... | CSNK2A3 CSNK2A1P |
3624 | Q9UQ88 | reviewed | CD11A_HUMAN | Cyclin-dependent kinase 11A (EC 2.7.11.22) (Ce... | CDK11A CDC2L2 CDC2L3 PITSLREB |
... | ... | ... | ... | ... | ... |
4761 | Q9UBF8 | reviewed | PI4KB_HUMAN | Phosphatidylinositol 4-kinase beta (PI4K-beta)... | PI4KB PIK4CB |
4790 | A4QPH2 | reviewed | PI4P2_HUMAN | Putative phosphatidylinositol 4-kinase alpha-l... | PI4KAP2 |
4812 | A4D2B8 | reviewed | PM2P1_HUMAN | Putative postmeiotic segregation increased 2-l... | PMS2P1 PMS2L1 PMS2L13 PMS2L6 PMS2L8 PMS3 PMS8 ... |
4878 | Q15119 | reviewed | PDK2_HUMAN | [Pyruvate dehydrogenase (acetyl-transferring)]... | PDK2 PDHK2 |
5441 | Q8N159 | reviewed | NAGS_HUMAN | N-acetylglutamate synthase, mitochondrial (EC ... | NAGS |
21 rows × 5 columns
The kinase keyword query contains a lot of non-serine/threonine/tyrosine kinases.
kinase domain query vs. kinase keyword query
# Load the human 'kinase' keyword query; rename 'Entry' so both tables share the 'Uniprot' key.
all_kinase = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx').rename(columns={'Entry':'Uniprot'})
a,b = get_diff(kd_human,all_kinase,'Uniprot')
a.iloc[:,:5] # special for kd domain query
Uniprot | Reviewed | Entry Name | Protein names | Gene Names | |
---|---|---|---|---|---|
1982 | Q496M5 | reviewed | PLK5_HUMAN | Inactive serine/threonine-protein kinase PLK5 ... | PLK5 PLK5P FG060302 |
5441 | Q8N159 | reviewed | NAGS_HUMAN | N-acetylglutamate synthase, mitochondrial (EC ... | NAGS |
The kinase domain query also includes an inactive kinase (PLK5), which is not marked with ‘kinase’ in the keywords.
The other one is NAGS, which contains an amino-acid kinase (AAK) domain and is not marked with ‘kinase’ in the keywords.
b.iloc[:,:5] # special for kinase keyword query
Uniprot | Entry Name | Protein names | Gene Names | uniprot_keyword_kinase | |
---|---|---|---|---|---|
0 | A2RU49 | HYKK_HUMAN | Hydroxylysine kinase (5-hydroxy-L-lysine kinas... | HYKK AGPHD1 | 1 |
3 | O00142 | KITM_HUMAN | Thymidine kinase 2, mitochondrial (EC 2.7.1.21... | TK2 | 1 |
11 | O00746 | NDKM_HUMAN | Nucleoside diphosphate kinase, mitochondrial (... | NME4 NM23D | 1 |
13 | O00764 | PDXK_HUMAN | Pyridoxal kinase (EC 2.7.1.35) (Pyridoxine kin... | PDXK C21orf124 C21orf97 PKH PNK PRED79 | 1 |
23 | O14986 | PI51B_HUMAN | Phosphatidylinositol 4-phosphate 5-kinase type... | PIP5K1B STM7 | 1 |
... | ... | ... | ... | ... | ... |
638 | O60885 | BRD4_HUMAN | Bromodomain-containing protein 4 (Protein HUNK1) | BRD4 HUNK1 | 0 |
639 | Q58F21 | BRDT_HUMAN | Bromodomain testis-specific protein (Cancer/te... | BRDT | 0 |
663 | O15164 | TIF1A_HUMAN | Transcription intermediary factor 1-alpha (TIF... | TRIM24 RNF82 TIF1 TIF1A | 0 |
664 | Q13263 | TIF1B_HUMAN | Transcription intermediary factor 1-beta (TIF1... | TRIM28 KAP1 RNF96 TIF1B | 0 |
665 | Q9UPN9 | TRI33_HUMAN | E3 ubiquitin-protein ligase TRIM33 (EC 2.3.2.2... | TRIM33 KIAA1113 RFG7 TIF1G | 0 |
151 rows × 5 columns
The kinase keyword query contains a lot of non-serine/threonine/tyrosine kinases.
Extract kinase domain info
Get residue start, end, and evidence info.
def extract_kinase_domains(text):
    """
    Extract kinase-related DOMAIN/REGION annotations from a UniProt feature string.

    Only features whose note contains 'kinase' or 'PI3K/PI4K catalytic'
    (case-insensitive) are kept.

    Returns a list of [note, start, end, evidence] entries, one per matching
    feature. start and end are the residue positions as ints (any '<' or '>'
    boundary marker is dropped); evidence is the string 'nan' when the feature
    has no /evidence field.
    Non-string input (e.g. a missing NaN cell) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    pattern = r'(?:REGION|DOMAIN) [<>]?(\d+)\.\.[<>]?(\d+); /note="([^"]*?(?:kinase|PI3K/PI4K catalytic)[^"]*?)"(?:; /evidence="([^"]*?)")?'
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    return [[note.strip(), int(start), int(end), evidence if evidence else 'nan']
            for start, end, note, evidence in matches]
Region column
Specific for kd4, without domain info but with region info
# kd4 comes from the region query, so parse its Region column instead of Domain [FT].
kd4['kd_info'] = kd4['Region'].apply(extract_kinase_domains)
kd4['kd_info']
0 [[Amino-acid kinase domain (AAK), 40, 361, ECO...
1 [[Kinase domain, 349, 560, ECO:0000250|UniProt...
2 [[Kinase domain, 354, 565, ECO:0000305|PubMed:...
3 [[Amino-acid kinase domain (AAK), 19, 376, nan]]
4 [[Amino-acid kinase domain (AAK), 19, 369, ECO...
5 [[HWE histidine kinase domain, 160, 236, nan]]
6 [[HWE histidine kinase domain, 285, 367, nan]]
7 [[HWE histidine kinase domain, 260, 303, nan]]
8 [[HWE histidine kinase domain, 259, 341, nan]]
9 [[HWE histidine kinase domain, 259, 341, nan]]
10 [[HWE histidine kinase domain, 286, 368, nan]]
11 [[HWE histidine kinase domain, 285, 367, nan]]
12 [[HWE histidine kinase domain, 285, 367, nan]]
13 [[HWE histidine kinase domain, 285, 367, nan]]
14 [[HWE histidine kinase domain, 254, 336, nan]]
15 [[HWE histidine kinase domain, 259, 341, nan]]
Name: kd_info, dtype: object
Domain
Most kinases contain domain info for kinase domain.
# Combine the three domain queries and keep one row per UniProt accession.
kd = pd.concat([kd1,kd2,kd3])
kd = kd.drop_duplicates('Uniprot').reset_index(drop=True)
kd['kd_info'] = kd['Domain [FT]'].apply(extract_kinase_domains)
kd['kd_info'].str.len().value_counts() # check non-zero
kd_info
1 5015
2 397
3 22
Name: count, dtype: int64
Some kinases contain 3 kinase-related domains
# Inspect the raw annotation of the first entry with three extracted domains.
kd[kd['kd_info'].str.len()==3]['Domain [FT]'].tolist()[0]
'DOMAIN 49..318; /note="Protein kinase 1"; /evidence="ECO:0000255|PROSITE-ProRule:PRU00159"; DOMAIN 319..387; /note="AGC-kinase C-terminal"; /evidence="ECO:0000255|PROSITE-ProRule:PRU00618"; DOMAIN 426..687; /note="Protein kinase 2"; /evidence="ECO:0000255|PROSITE-ProRule:PRU00159"'
After checking, it seems the third entry is the AGC-kinase C-terminal domain. We will remove those later.
# Append the region-derived entries, then expand to one row per extracted domain.
kd_final = pd.concat([kd,kd4])
df = kd_final.explode('kd_info', ignore_index=True)
df
Uniprot | Reviewed | Entry Name | Protein names | Gene Names | Gene Names (primary) | Organism | Length | Domain [FT] | Domain [CC] | Motif | Protein families | Reactome | ComplexPortal | Subcellular location [CC] | Gene Ontology (biological process) | Tissue specificity | Interacts with | Subunit structure | Function [CC] | Activity regulation | Mass | Sequence | kd_info | Region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A075F7E9 | reviewed | LERK1_ORYSI | G-type lectin S-receptor-like serine/threonine... | LECRK1 LECRK OsI_14840 | LECRK1 | Oryza sativa subsp. indica (Rice) | 813 | DOMAIN 22..149; /note="Bulb-type lectin"; /evi... | NaN | NaN | Protein kinase superfamily, Ser/Thr protein ki... | NaN | NaN | SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ... | defense response [GO:0006952]; response to oth... | TISSUE SPECIFICITY: Expressed in plumules, rad... | NaN | SUBUNIT: Interacts (via kinase domain) with AD... | FUNCTION: Involved in innate immunity. Require... | NaN | 90770 | MVALLLFPMLLQLLSPTCAQTQKNITLGSTLAPQGPASSWLSPSGD... | [Protein kinase, 523, 797, ECO:0000255|PROSITE... | NaN |
1 | A0A078CGE6 | reviewed | M3KE1_BRANA | MAP3K epsilon protein kinase 1 (BnM3KE1) (EC 2... | M3KE1 BnaA03g30290D GSBRNA2T00111755001 | M3KE1 | Brassica napus (Rape) | 1299 | DOMAIN 20..274; /note="Protein kinase"; /evide... | NaN | NaN | Protein kinase superfamily, Ser/Thr protein ki... | NaN | NaN | SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,... | cell division [GO:0051301]; protein autophosph... | TISSUE SPECIFICITY: Expressed in both the spor... | NaN | NaN | FUNCTION: Serine/threonine-protein kinase invo... | NaN | 143609 | MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV... | [Protein kinase, 20, 274, ECO:0000255|PROSITE-... | NaN |
2 | A0A0H3MBJ2 | reviewed | PKN1_CHLT2 | Serine/threonine-protein kinase Pkn1 (EC 2.7.1... | pkn1 CTL0400 | pkn1 | Chlamydia trachomatis serovar L2 (strain ATCC ... | 614 | DOMAIN 13..276; /note="Protein kinase"; /evide... | NaN | NaN | Protein kinase superfamily, Ser/Thr protein ki... | NaN | NaN | NaN | NaN | NaN | NaN | SUBUNIT: Interacts with PknD, interacts with a... | FUNCTION: Together with the serine/threonine k... | NaN | 69638 | MEERAAVEYWGDYKVIAELGHGLWSRDVLAEHRFIKKRYILKILPS... | [Protein kinase, 13, 276, ECO:0000255|PROSITE-... | NaN |
3 | A0A0K3AV08 | reviewed | MLK1_CAEEL | Mitogen-activated protein kinase kinase kinase... | mlk-1 K11D12.10 | mlk-1 | Caenorhabditis elegans | 1059 | DOMAIN 69..130; /note="SH3"; /evidence="ECO:00... | NaN | MOTIF 937..940; /note="NPQY motif"; /evidence=... | Protein kinase superfamily, STE Ser/Thr protei... | NaN | NaN | NaN | axon regeneration [GO:0031103]; defense respon... | TISSUE SPECIFICITY: Expressed in pharynx, inte... | NaN | SUBUNIT: Interacts with max-2; the interaction... | FUNCTION: Serine/threonine-protein kinase whic... | ACTIVITY REGULATION: Activated by phosphorylat... | 117635 | MEQASVPSYVNIPPIAKTRSTSHLAPTPEHHRSVSYEDTTTASTST... | [Protein kinase, 150, 454, ECO:0000255|PROSITE... | NaN |
4 | A0A0P0VIP0 | reviewed | LRSK7_ORYSJ | L-type lectin-domain containing receptor kinas... | LECRKS7 DAF1 Os02g0459600 LOC_Os02g26160 | LECRKS7 | Oryza sativa subsp. japonica (Rice) | 695 | DOMAIN 389..661; /note="Protein kinase"; /evid... | NaN | NaN | Leguminous lectin family; Protein kinase super... | NaN | NaN | SUBCELLULAR LOCATION: Cell membrane {ECO:00002... | pollen aperture formation [GO:0062075]; pollen... | TISSUE SPECIFICITY: Expressed in roots, leaves... | NaN | SUBUNIT: Interacts with INP1 (PubMed:32284546)... | FUNCTION: Legume-lectin receptor-like kinase r... | NaN | 74135 | MPPRCRRLPLLFILLLAVRPLSAAAASSIAAAPASSYRRISWASNL... | [Protein kinase, 389, 661, ECO:0000255|PROSITE... | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5886 | A9MBM8 | reviewed | LOVHK_BRUC2 | Blue-light-activated histidine kinase (EC 2.7.... | BCAN_B0589 | NaN | Brucella canis (strain ATCC 23365 / NCTC 10854... | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54874 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 285, 367, nan] | REGION 285..367; /note="HWE histidine kinase d... |
5887 | A9WYQ7 | reviewed | LOVHK_BRUSI | Blue-light-activated histidine kinase (EC 2.7.... | BSUIS_B0585 | NaN | Brucella suis (strain ATCC 23445 / NCTC 10510) | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54874 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 285, 367, nan] | REGION 285..367; /note="HWE histidine kinase d... |
5888 | B2SB67 | reviewed | LOVHK_BRUA1 | Blue-light-activated histidine kinase (EC 2.7.... | BAbS19_II06090 | NaN | Brucella abortus (strain S19) | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54902 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 285, 367, nan] | REGION 285..367; /note="HWE histidine kinase d... |
5889 | Q577Y7 | reviewed | LOVHK_BRUAB | Blue-light-activated histidine kinase (EC 2.7.... | BruAb2_0636 | NaN | Brucella abortus biovar 1 (strain 9-941) | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54874 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 254, 336, nan] | REGION 254..336; /note="HWE histidine kinase d... |
5890 | Q8FW73 | reviewed | LOVHK_BRUSU | Blue-light-activated histidine kinase (EC 2.7.... | BRA0588 BS1330_II0583 | NaN | Brucella suis biovar 1 (strain 1330) | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54874 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 259, 341, nan] | REGION 259..341; /note="HWE histidine kinase d... |
5891 rows × 25 columns
# Split each [note, start, end, evidence] list into four separate columns.
df[['domain_note', 'domain_start','domain_end', 'domain_evidence']] = df.kd_info.apply(pd.Series)
df
Uniprot | Reviewed | Entry Name | Protein names | Gene Names | Gene Names (primary) | Organism | Length | Domain [FT] | Domain [CC] | Motif | Protein families | Reactome | ComplexPortal | Subcellular location [CC] | Gene Ontology (biological process) | Tissue specificity | Interacts with | Subunit structure | Function [CC] | Activity regulation | Mass | Sequence | kd_info | Region | domain_note | domain_start | domain_end | domain_evidence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A075F7E9 | reviewed | LERK1_ORYSI | G-type lectin S-receptor-like serine/threonine... | LECRK1 LECRK OsI_14840 | LECRK1 | Oryza sativa subsp. indica (Rice) | 813 | DOMAIN 22..149; /note="Bulb-type lectin"; /evi... | NaN | NaN | Protein kinase superfamily, Ser/Thr protein ki... | NaN | NaN | SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ... | defense response [GO:0006952]; response to oth... | TISSUE SPECIFICITY: Expressed in plumules, rad... | NaN | SUBUNIT: Interacts (via kinase domain) with AD... | FUNCTION: Involved in innate immunity. Require... | NaN | 90770 | MVALLLFPMLLQLLSPTCAQTQKNITLGSTLAPQGPASSWLSPSGD... | [Protein kinase, 523, 797, ECO:0000255|PROSITE... | NaN | Protein kinase | 523 | 797 | ECO:0000255|PROSITE-ProRule:PRU00159 |
1 | A0A078CGE6 | reviewed | M3KE1_BRANA | MAP3K epsilon protein kinase 1 (BnM3KE1) (EC 2... | M3KE1 BnaA03g30290D GSBRNA2T00111755001 | M3KE1 | Brassica napus (Rape) | 1299 | DOMAIN 20..274; /note="Protein kinase"; /evide... | NaN | NaN | Protein kinase superfamily, Ser/Thr protein ki... | NaN | NaN | SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,... | cell division [GO:0051301]; protein autophosph... | TISSUE SPECIFICITY: Expressed in both the spor... | NaN | NaN | FUNCTION: Serine/threonine-protein kinase invo... | NaN | 143609 | MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV... | [Protein kinase, 20, 274, ECO:0000255|PROSITE-... | NaN | Protein kinase | 20 | 274 | ECO:0000255|PROSITE-ProRule:PRU00159 |
2 | A0A0H3MBJ2 | reviewed | PKN1_CHLT2 | Serine/threonine-protein kinase Pkn1 (EC 2.7.1... | pkn1 CTL0400 | pkn1 | Chlamydia trachomatis serovar L2 (strain ATCC ... | 614 | DOMAIN 13..276; /note="Protein kinase"; /evide... | NaN | NaN | Protein kinase superfamily, Ser/Thr protein ki... | NaN | NaN | NaN | NaN | NaN | NaN | SUBUNIT: Interacts with PknD, interacts with a... | FUNCTION: Together with the serine/threonine k... | NaN | 69638 | MEERAAVEYWGDYKVIAELGHGLWSRDVLAEHRFIKKRYILKILPS... | [Protein kinase, 13, 276, ECO:0000255|PROSITE-... | NaN | Protein kinase | 13 | 276 | ECO:0000255|PROSITE-ProRule:PRU00159 |
3 | A0A0K3AV08 | reviewed | MLK1_CAEEL | Mitogen-activated protein kinase kinase kinase... | mlk-1 K11D12.10 | mlk-1 | Caenorhabditis elegans | 1059 | DOMAIN 69..130; /note="SH3"; /evidence="ECO:00... | NaN | MOTIF 937..940; /note="NPQY motif"; /evidence=... | Protein kinase superfamily, STE Ser/Thr protei... | NaN | NaN | NaN | axon regeneration [GO:0031103]; defense respon... | TISSUE SPECIFICITY: Expressed in pharynx, inte... | NaN | SUBUNIT: Interacts with max-2; the interaction... | FUNCTION: Serine/threonine-protein kinase whic... | ACTIVITY REGULATION: Activated by phosphorylat... | 117635 | MEQASVPSYVNIPPIAKTRSTSHLAPTPEHHRSVSYEDTTTASTST... | [Protein kinase, 150, 454, ECO:0000255|PROSITE... | NaN | Protein kinase | 150 | 454 | ECO:0000255|PROSITE-ProRule:PRU00159 |
4 | A0A0P0VIP0 | reviewed | LRSK7_ORYSJ | L-type lectin-domain containing receptor kinas... | LECRKS7 DAF1 Os02g0459600 LOC_Os02g26160 | LECRKS7 | Oryza sativa subsp. japonica (Rice) | 695 | DOMAIN 389..661; /note="Protein kinase"; /evid... | NaN | NaN | Leguminous lectin family; Protein kinase super... | NaN | NaN | SUBCELLULAR LOCATION: Cell membrane {ECO:00002... | pollen aperture formation [GO:0062075]; pollen... | TISSUE SPECIFICITY: Expressed in roots, leaves... | NaN | SUBUNIT: Interacts with INP1 (PubMed:32284546)... | FUNCTION: Legume-lectin receptor-like kinase r... | NaN | 74135 | MPPRCRRLPLLFILLLAVRPLSAAAASSIAAAPASSYRRISWASNL... | [Protein kinase, 389, 661, ECO:0000255|PROSITE... | NaN | Protein kinase | 389 | 661 | ECO:0000255|PROSITE-ProRule:PRU00159 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5886 | A9MBM8 | reviewed | LOVHK_BRUC2 | Blue-light-activated histidine kinase (EC 2.7.... | BCAN_B0589 | NaN | Brucella canis (strain ATCC 23365 / NCTC 10854... | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54874 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 285, 367, nan] | REGION 285..367; /note="HWE histidine kinase d... | HWE histidine kinase domain | 285 | 367 | nan |
5887 | A9WYQ7 | reviewed | LOVHK_BRUSI | Blue-light-activated histidine kinase (EC 2.7.... | BSUIS_B0585 | NaN | Brucella suis (strain ATCC 23445 / NCTC 10510) | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54874 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 285, 367, nan] | REGION 285..367; /note="HWE histidine kinase d... | HWE histidine kinase domain | 285 | 367 | nan |
5888 | B2SB67 | reviewed | LOVHK_BRUA1 | Blue-light-activated histidine kinase (EC 2.7.... | BAbS19_II06090 | NaN | Brucella abortus (strain S19) | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54902 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 285, 367, nan] | REGION 285..367; /note="HWE histidine kinase d... | HWE histidine kinase domain | 285 | 367 | nan |
5889 | Q577Y7 | reviewed | LOVHK_BRUAB | Blue-light-activated histidine kinase (EC 2.7.... | BruAb2_0636 | NaN | Brucella abortus biovar 1 (strain 9-941) | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54874 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 254, 336, nan] | REGION 254..336; /note="HWE histidine kinase d... | HWE histidine kinase domain | 254 | 336 | nan |
5890 | Q8FW73 | reviewed | LOVHK_BRUSU | Blue-light-activated histidine kinase (EC 2.7.... | BRA0588 BS1330_II0583 | NaN | Brucella suis biovar 1 (strain 1330) | 489 | DOMAIN 19..93; /note="PAS"; /evidence="ECO:000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FUNCTION: Photosensitive kinase that is involv... | NaN | 54874 | MAIDLRPFIPFGRGALSQATDPFRAAVEFTLMPMLITNPHLPDNPI... | [HWE histidine kinase domain, 259, 341, nan] | REGION 259..341; /note="HWE histidine kinase d... | HWE histidine kinase domain | 259 | 341 | nan |
5891 rows × 29 columns
# kd_info is redundant once it has been split into the domain_* columns.
df = df.drop(columns=['kd_info'])
Remove non kinase domains:
# Drop rows whose note is exactly 'AGC-kinase C-terminal', then tally the remaining notes.
df = df[df.domain_note!='AGC-kinase C-terminal'].reset_index(drop=True)
df.domain_note.value_counts()
domain_note
Protein kinase 4530
Histidine kinase 625
PI3K/PI4K catalytic 168
Protein kinase 2 73
Protein kinase 1 73
Alpha-type protein kinase 22
Protein kinase; inactive 13
HWE histidine kinase domain 11
Guanylate kinase-like 5
Histidine kinase 2 4
Histidine kinase 1 4
Amino-acid kinase domain (AAK) 3
Kinase domain 2
Protein kinase; truncated 1
Histidine kinase; first part 1
Histidine kinase; second part 1
Name: count, dtype: int64
Download uniprot sequence
The very large TITIN proteins (A2ASS6 for mouse, and Q8WZ42 for human) are truncated when downloaded online.
from tqdm.contrib.concurrent import thread_map
from kdock.core.protein import get_uniprot_seq
# Keep the spreadsheet sequence under a new name, then fetch each sequence again by UniProt ID.
df = df.rename(columns={'Sequence':'full_protein_seq'})
full_seq = thread_map(get_uniprot_seq,df.Uniprot, max_workers=5)
CPU times: user 28.3 s, sys: 4.08 s, total: 32.4 s
Wall time: 13min 6s
# Store the fetched sequences and list rows where they differ from the spreadsheet sequence.
df['full_seq']=full_seq
df[df['full_seq']!=df['full_protein_seq']]
KD_ID | Uniprot | Entry Name | Protein names | Gene Names | Gene Names (primary) | Organism | domain_note | domain_evidence | domain_start | ... | ComplexPortal | Subcellular location [CC] | Gene Ontology (biological process) | Tissue specificity | Interacts with | Subunit structure | Function [CC] | Activity regulation | full_protein_seq | full_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
91 | A2ASS6_TITIN_MOUSE_KD1 | A2ASS6 | TITIN_MOUSE | Titin (EC 2.7.11.1) (Connectin) | Ttn | Ttn | Mus musculus (Mouse) | Protein kinase | ECO:0000255|PROSITE-ProRule:PRU00159 | 33040 | ... | CPX-127; | SUBCELLULAR LOCATION: Cytoplasm {ECO:0000305}.... | adult heart development [GO:0007512]; cardiac ... | NaN | Q70IV5 | SUBUNIT: Interacts with MYOM1, MYOM2, tropomyo... | FUNCTION: Key component in the assembly and fu... | ACTIVITY REGULATION: Full activation of the pr... | MTTQAPMFTQPLQSVVVLEGSTATFEAHVSGSPVPEVSWFRDGQVI... | MTTQAPMFTQPLQSVVVLEGSTATFEAHVSGSPVPEVSWFRDGQVI... |
4500 | Q8WZ42_TITIN_HUMAN_KD1 | Q8WZ42 | TITIN_HUMAN | Titin (EC 2.7.11.1) (Connectin) (Rhabdomyosarc... | TTN | TTN | Homo sapiens (Human) | Protein kinase | ECO:0000255|PROSITE-ProRule:PRU00159 | 32178 | ... | CPX-101; | SUBCELLULAR LOCATION: Cytoplasm {ECO:0000305|P... | cardiac muscle cell development [GO:0055013]; ... | TISSUE SPECIFICITY: Isoforms 3, 7 and 8 are ex... | P12814; P35609; P62158; P20807; O75953; O75923... | SUBUNIT: Interacts with MYOM1, MYOM2, tropomyo... | FUNCTION: Key component in the assembly and fu... | ACTIVITY REGULATION: Full activation of the pr... | MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... | MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI... |
2 rows × 28 columns
It seems the UniProt online download limits the sequence length.
Get domain sequence
def get_region(seq, start, end):
    """Return the part of `seq` from 1-based position `start` through `end`, inclusive."""
    # Shift the 1-based start to a 0-based slice index; the inclusive 1-based end
    # already equals the exclusive 0-based stop.
    first = start - 1
    return seq[first:end]
# Cut each domain out of the fetched full-length sequence using its start/end positions.
df['domain_seq'] = df.apply(lambda r: get_region(r['full_seq'],r['domain_start'],r['domain_end']),axis=1)
df['domain_seq']
0 AGFHEILGAGASGVVYKGQLEDELKTNIAVKTIHKLQPETEKEFMV...
1 RVSTISTARASYSSIFSGNVAEHAIVNKQKVSVKRHVQRRAITFSR...
2 YMLGDEIGKGAYGRVYIGLDLENGDFVAIKQVSLENIVQEDLNTIM...
3 RLEECRIMSSAKRPLWLNWENPDIMSELLFQNNEIIFKNGDDLRQD...
4 GASHELKTPLASLRIILENMQHNIGDYKDHPKYIAKSINKIDQMSH...
...
5531 YILKDLLGTGAFSQVRLAEVKEDPSRVVAIKIIDKKALKGKEDSLE...
5532 YTVEKEIGKGSFAIVYKGVSLRDGRNIAIKAVSRSKLKNKKLLENL...
5533 LETERIIGRGTFGTVKLVHHKPTKIRYALKCVSKRSIINLNQQNNI...
5534 LKFDEELGRGSFKTVFRGLDTETGVAVAWCELQESKLNKTERQRFR...
5535 EASQRSFASGPSTSTKLTVESRTETTRFIFYIYQVRNNEVVAANKH...
Name: domain_seq, Length: 5536, dtype: object
# Count rows with a missing domain sequence.
df['domain_seq'].isna().sum()
0
Arrange columns & save
# Order domains by position within each protein so the numbering follows sequence order.
df = df.sort_values(['Uniprot','domain_start']).reset_index(drop=True)
# Number domains 1..n per protein. cumcount keeps IDs unique even when a protein has
# three or more domains, whereas duplicated() would label every repeat as KD2.
df['KD_ID']=df['Uniprot']+'_'+df['Entry Name']+'_KD'+(df.groupby('Uniprot').cumcount()+1).astype(str)
# Column order for the final table; columns not listed here are dropped.
col = ['KD_ID','Uniprot', 'Entry Name', 'Protein names', 'Gene Names','Gene Names (primary)',
       'Organism', 'domain_note','domain_evidence', 'domain_start', 'domain_end','domain_seq' , 'Domain [FT]',
       'Domain [CC]', 'Region', 'Motif', 'Protein families', 'Reactome', 'ComplexPortal',
       'Subcellular location [CC]', 'Gene Ontology (biological process)',
       'Tissue specificity', 'Interacts with', 'Subunit structure',
       'Function [CC]', 'Activity regulation', 'full_seq',
      ]
df = df[col]
# df[df.KD_ID.str.contains('JAK1')] # check JAK1 to see if KD1 and KD2 is assigned correctly
# Positional rename of the 27 selected columns: KD_ID -> kd_ID and domain_* -> kd_*.
new_colname=['kd_ID','Uniprot', 'Entry Name', 'Protein names', 'Gene Names','Gene Names (primary)',
             'Organism', 'kd_note','kd_evidence', 'kd_start', 'kd_end','kd_seq' , 'Domain [FT]',
             'Domain [CC]', 'Region', 'Motif', 'Protein families', 'Reactome', 'ComplexPortal',
             'Subcellular location [CC]', 'Gene Ontology (biological process)',
             'Tissue specificity', 'Interacts with', 'Subunit structure',
             'Function [CC]', 'Activity regulation', 'full_seq',
            ]
df.columns=new_colname
# df.to_excel('out/uniprot_kd.xlsx',index=False)
df.shape
(5536, 27)
Kinases with 1-2 kinase domains
# Collect the domain notes of each protein into a list and tally the combinations.
# NOTE(review): the rename cell earlier in this notebook turns 'domain_note' into
# 'kd_note', so this cell appears to have been run before that rename. Confirm the
# intended order, or switch to 'kd_note' for a top-to-bottom run.
out = df.groupby('Uniprot').agg({'domain_note':list})
out.domain_note.value_counts()
domain_note
[Protein kinase] 4521
[Histidine kinase] 621
[PI3K/PI4K catalytic] 168
[Protein kinase 1, Protein kinase 2] 73
[Alpha-type protein kinase] 22
[Protein kinase; inactive] 13
[HWE histidine kinase domain] 11
[Protein kinase, Guanylate kinase-like] 5
[Histidine kinase 1, Histidine kinase 2] 4
[Protein kinase, Histidine kinase] 4
[Amino-acid kinase domain (AAK)] 3
[Kinase domain] 2
[Histidine kinase; first part] 1
[Histidine kinase; second part] 1
[Protein kinase; truncated] 1
Name: count, dtype: int64
Some kinases contain two kinase domains. Some are both protein kinase domains, while some have the first to be protein kinase and the second to be histidine kinase.
Duplicates among kinase domains
Across species
def get_dup(df):
    """
    Find kinase-domain sequences shared by more than one row.

    df must have 'kd_seq' and 'kd_ID' columns. Returns a DataFrame with one row
    per duplicated sequence: 'kd_seq' plus 'kd_ID', a comma-joined string of all
    IDs that share that sequence.
    """
    dup = df[df.kd_seq.duplicated(keep=False)]
    # groupby already orders the groups by kd_seq and keeps rows in their original
    # order within each group, so no separate (unstable) sort is needed and the
    # joined IDs come out in a deterministic order.
    return dup.groupby('kd_seq').agg({'kd_ID': ','.join}).reset_index()
# Duplicated domain sequences across all species, restricted to groups that include a human entry.
dup_unique = get_dup(df)
dup_unique[dup_unique.kd_ID.str.contains("HUMAN")]
kd_seq | kd_ID | |
---|---|---|
26 | EPWQEKVRRIREGSPYGHLPNWRLLSVIVKCGDDLRQELLAFQVLK... | B4UT09_PI4KB_OTOGA_KD1,A9X1A0_PI4KB_PAPAN_KD1,... |
30 | EYEGCKVGRGTYGHVYKARRKDGKDEKEYALKQIEGTGISMSACRE... | Q8BWD8_CDK19_MOUSE_KD1,Q9BWU1_CDK19_HUMAN_KD1 |
31 | FDFLKVIGKGSFGKVLLAKRKLDGKFYAVKVLQKKIVLNRKEQKHI... | Q96BR1_SGK3_HUMAN_KD1,Q5R7A7_SGK3_PONAB_KD1 |
33 | FDIIGIIGEGTYGQVYKARDKDTGEMVALKKVRLDNEKEGFPITAI... | E1BB52_CDK13_BOVIN_KD1,Q14004_CDK13_HUMAN_KD1,... |
34 | FDLIRVIGRGSYAKVLLVRLKKNDQIYAMKVVKKELVHDDEDIDWV... | P09217_KPCZ_RAT_KD1,Q05513_KPCZ_HUMAN_KD1,Q029... |
... | ... | ... |
271 | YTLKDEIGKGSYGVVKLAYNENDNTYYAMKVLSKKKLIRQAGFPRR... | Q96RR4_KKCC2_HUMAN_KD1,O88831_KKCC2_RAT_KD1 |
274 | YTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLR... | P46196_MK01_BOVIN_KD1,P28482_MK01_HUMAN_KD1 |
276 | YTRFEKIGQGASGTVYTALDIATGQEVAIKQMNLQQQPKKELIINE... | Q7YQL4_PAK3_PANTR_KD1,Q7YQL3_PAK3_PONPY_KD1,O7... |
277 | YTRFEKIGQGASGTVYTAMDVATGQEVAIKQMNLQQQPKKELIINE... | Q13153_PAK1_HUMAN_KD1,P35465_PAK1_RAT_KD1,Q08E... |
278 | YTRYEKIGQGASGTVFTATDVALGQEVAIKQINLQKQPKKELIINE... | Q13177_PAK2_HUMAN_KD1,Q8CIN4_PAK2_MOUSE_KD1 |
90 rows × 2 columns
Duplicates within human
# Restrict to human entries and look for identical domain sequences among them.
human=df[df.Organism=='Homo sapiens (Human)']
dup_human = get_dup(human)
dup_human
kd_seq | kd_ID |
---|
Check duplicates within other species
# Repeat the within-species duplicate check for the five most frequent organisms.
species = df.Organism.value_counts().head().index
for s in species:
    df_species=df[df.Organism==s]
    print(s)
    print(get_dup(df_species))
Arabidopsis thaliana (Mouse-ear cress)
Empty DataFrame
Columns: [kd_seq, kd_ID]
Index: []
Mus musculus (Mouse)
kd_seq \
0 YVMLETIGHGGCATVKLAQHRLTGTHVAVKTIRKREYWCNRVISEV...
kd_ID
0 C0HKC8_SMK3A_MOUSE_KD1,C0HKC9_SMK3B_MOUSE_KD1
Homo sapiens (Human)
Empty DataFrame
Columns: [kd_seq, kd_ID]
Index: []
Dictyostelium discoideum (Social amoeba)
Empty DataFrame
Columns: [kd_seq, kd_ID]
Index: []
Rattus norvegicus (Rat)
Empty DataFrame
Columns: [kd_seq, kd_ID]
Index: []
Only the mouse SMK3A/SMK3B pair is duplicated among the top 5 species.