Alignment

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
import pandas as pd

Prepare fasta file for clustalo

kd=pd.read_excel('out/uniprot_kd.xlsx')

kd

	kd_ID	Uniprot	Entry Name	Protein names	Gene Names	Gene Names (primary)	Organism	kd_note	kd_evidence	kd_start	...	Reactome	ComplexPortal	Subcellular location [CC]	Gene Ontology (biological process)	Tissue specificity	Interacts with	Subunit structure	Function [CC]	Activity regulation	full_seq
0	A0A075F7E9_LERK1_ORYSI_KD1	A0A075F7E9	LERK1_ORYSI	G-type lectin S-receptor-like serine/threonine...	LECRK1 LECRK OsI_14840	LECRK1	Oryza sativa subsp. indica (Rice)	Protein kinase	ECO:0000255\|PROSITE-ProRule:PRU00159	523	...	NaN	NaN	SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ...	defense response [GO:0006952]; response to oth...	TISSUE SPECIFICITY: Expressed in plumules, rad...	NaN	SUBUNIT: Interacts (via kinase domain) with AD...	FUNCTION: Involved in innate immunity. Require...	NaN	MVALLLFPMLLQLLSPTCAQTQKNITLGSTLAPQGPASSWLSPSGD...
1	A0A078BQP2_GCY25_CAEEL_KD1	A0A078BQP2	GCY25_CAEEL	Receptor-type guanylate cyclase gcy-25 (EC 4.6...	gcy-25 Y105C5B.2	gcy-25	Caenorhabditis elegans	Protein kinase	ECO:0000255\|PROSITE-ProRule:PRU00159	464	...	R-CEL-2514859;	NaN	SUBCELLULAR LOCATION: Cell membrane {ECO:00003...	cGMP biosynthetic process [GO:0006182]; intrac...	TISSUE SPECIFICITY: Expressed in AQR, PQR and ...	NaN	NaN	FUNCTION: Guanylate cyclase involved in the pr...	NaN	MLLLLLLLKISTFVDSFQIGHLEFENSNETRILEICMKNAGSWRDH...
2	A0A078CGE6_M3KE1_BRANA_KD1	A0A078CGE6	M3KE1_BRANA	MAP3K epsilon protein kinase 1 (BnM3KE1) (EC 2...	M3KE1 BnaA03g30290D GSBRNA2T00111755001	M3KE1	Brassica napus (Rape)	Protein kinase	ECO:0000255\|PROSITE-ProRule:PRU00159	20	...	NaN	NaN	SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,...	cell division [GO:0051301]; protein autophosph...	TISSUE SPECIFICITY: Expressed in both the spor...	NaN	NaN	FUNCTION: Serine/threonine-protein kinase invo...	NaN	MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...
3	A0A0G2K344_PK3CA_RAT_KD1	A0A0G2K344	PK3CA_RAT	Phosphatidylinositol 4,5-bisphosphate 3-kinase...	Pik3ca	Pik3ca	Rattus norvegicus (Rat)	PI3K/PI4K catalytic	ECO:0000255\|PROSITE-ProRule:PRU00269	765	...	R-RNO-109704;R-RNO-112399;R-RNO-114604;R-RNO-1...	NaN	NaN	actin cytoskeleton organization [GO:0030036]; ...	TISSUE SPECIFICITY: Detected in the hypothalam...	NaN	SUBUNIT: Heterodimer of a catalytic subunit PI...	FUNCTION: Phosphoinositide-3-kinase (PI3K) pho...	NaN	MPPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLVTIK...
4	A0A0H2ZM62_HK06_STRP2_KD1	A0A0H2ZM62	HK06_STRP2	Sensor histidine protein kinase HK06 (EC 2.7.1...	hk06 SPD_2019	hk06	Streptococcus pneumoniae serotype 2 (strain D3...	Histidine kinase	ECO:0000255\|PROSITE-ProRule:PRU00107	239	...	NaN	NaN	SUBCELLULAR LOCATION: Cell membrane {ECO:00003...	NaN	NaN	NaN	NaN	FUNCTION: Member of the two-component regulato...	NaN	MIKNPKLLTKSFLRSFAILGGVGLVIHIAIYLTFPFYYIQLEGEKF...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5531	W0LYS5_CAMKI_MACNP_KD1	W0LYS5	CAMKI_MACNP	Calcium/calmodulin-dependent protein kinase ty...	CaMKI	CaMKI	Macrobrachium nipponense (Oriental river shrim...	Protein kinase	ECO:0000255\|PROSITE-ProRule:PRU00159	31	...	NaN	NaN	NaN	molting cycle process [GO:0022404]	TISSUE SPECIFICITY: Highly expressed in hepato...	NaN	NaN	FUNCTION: Calcium/calmodulin-dependent protein...	ACTIVITY REGULATION: Activated by Ca(2+)/calmo...	MPLFGSKKETAKKSSKKDKDEGKMPAVEDKYILKDLLGTGAFSQVR...
5532	W0T9X4_ATG1_KLUMD_KD1	W0T9X4	ATG1_KLUMD	Serine/threonine-protein kinase ATG1 (EC 2.7.1...	ATG1 KLMA_30321	ATG1	Kluyveromyces marxianus (strain DMKU3-1042 / B...	Protein kinase	ECO:0000255\|PROSITE-ProRule:PRU00159	21	...	NaN	NaN	SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250\|U...	autophagosome assembly [GO:0000045]; autophagy...	NaN	W0TA43	SUBUNIT: Homodimer (By similarity). Dimerizati...	FUNCTION: Serine/threonine protein kinase invo...	NaN	MSSESHGKAVAKAIRLPTENYTVEKEIGKGSFAIVYKGVSLRDGRN...
5533	W7JX98_KGP_PLAFO_KD1	W7JX98	KGP_PLAFO	cGMP-dependent protein kinase (EC 2.7.11.12)	PKG PFNF54_05395	PKG	Plasmodium falciparum (isolate NF54)	Protein kinase	ECO:0000255\|PROSITE-ProRule:PRU00159	541	...	NaN	NaN	SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250\|U...	protein phosphorylation [GO:0006468]	NaN	NaN	SUBUNIT: Monomer. {ECO:0000250\|UniProtKB:Q8I719}.	FUNCTION: Serine/threonine protein kinase whic...	ACTIVITY REGULATION: Activated by cGMP. Not ac...	MEEDDNLKKGNERNKKKAIFSNDDFTGEDSLMEDHLELREKLSEDI...
5534	X5M5N0_WNK_CAEEL_KD1	X5M5N0	WNK_CAEEL	Serine/threonine-protein kinase WNK (EC 2.7.11...	wnk-1 C46C2.1	wnk-1	Caenorhabditis elegans	Protein kinase	ECO:0000255\|PROSITE-ProRule:PRU00159	334	...	NaN	NaN	SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250\|U...	cell volume homeostasis [GO:0006884]; cellular...	TISSUE SPECIFICITY: Expressed in pharynx, nerv...	G5EEN4	SUBUNIT: Interacts with gck-3 (via C-terminus)...	FUNCTION: Serine/threonine-protein kinase comp...	ACTIVITY REGULATION: Activated in response to ...	MPDSITNGGRPPAPPSSVSSTTASTTGNFGTRRRLVNRIKKVDELH...
5535	X5M8U1_GCY17_CAEEL_KD1	X5M8U1	GCY17_CAEEL	Receptor-type guanylate cyclase gcy-17 (EC 4.6...	gcy-17 W03F11.2	gcy-17	Caenorhabditis elegans	Protein kinase	ECO:0000255\|PROSITE-ProRule:PRU00159	535	...	R-CEL-2514859;	NaN	SUBCELLULAR LOCATION: Cell membrane {ECO:00003...	cGMP biosynthetic process [GO:0006182]; intrac...	TISSUE SPECIFICITY: Expressed in PHA sensory n...	NaN	NaN	FUNCTION: Guanylate cyclase involved in the pr...	NaN	MLFLRLFIFTPFLILANCQARRTIKVGLLFVQNVSSLQVGIGYRTS...

5536 rows × 27 columns

def get_fasta(df,seq_col='kd_seq',id_col='kd_ID',path='out.fasta'):
    "Generate fasta file from sequences."
    records = [
        SeqRecord(Seq(row[seq_col]), id=row[id_col], description="")
        for _, row in df.iterrows()
    ]
    SeqIO.write(records, path, "fasta")
    print(len(records))

get_fasta(kd,path='raw/kinase_domains.fasta')

For human only:

human = kd[kd.Organism=='Homo sapiens (Human)']

get_fasta(human,path="raw/human_kinase_domains.fasta")

for PI3K catalytic domain

pi3k = kd[kd.kd_note=='PI3K/PI4K catalytic']

get_fasta(pi3k,path="raw/pi3k_kinase_domains.fasta")

Run clustalo

sudo apt-get update
sudo apt-get install clustalo
clustalo -i kinase_domains.fasta -o kinase_domains.aln --force --outfmt=clu

import subprocess
subprocess.run([
    "clustalo", "-i", "kinase_domains.fasta", 
    "-o", "kinase_domains.aln", "--force", "--outfmt=clu"
])

Analyze alignment

def aln2df(fname):
    alignment = AlignIO.read(fname, "clustal")
    alignment_array = [list(str(record.seq)) for record in alignment]
    return pd.DataFrame(alignment_array)

alignment = AlignIO.read("raw/kinase_domains.aln", "clustal")

Turns in to dataframe:

alignment_array = [list(str(record.seq)) for record in alignment]
df = pd.DataFrame(alignment_array)

df.columns = df.columns+1

df.index=kd.kd_ID

# df.to_parquet('output/uniprot_kd_align.parquet')

df.head()

	1	2	3	4	5	6	7	8	9	10	...	3425	3426	3427	3428	3429	3430	3431	3432	3433	3434
kd_ID
A0A075F7E9_LERK1_ORYSI_KD1	-	-	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
A0A078BQP2_GCY25_CAEEL_KD1	-	-	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
A0A078CGE6_M3KE1_BRANA_KD1	-	-	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
A0A0G2K344_PK3CA_RAT_KD1	-	-	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
A0A0H2ZM62_HK06_STRP2_KD1	-	-	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-

5 rows × 3434 columns

# Compute residue frequency at each position
counts_df = df.apply(lambda col: col.value_counts(), axis=0).fillna(0)
freq_df = counts_df.div(counts_df.sum(axis=0), axis=1)

freq_df.head()

	1	2	3	4	5	6	7	8	9	10	...	3425	3426	3427	3428	3429	3430	3431	3432	3433	3434
-	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	...	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819	0.999819
A	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
C	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
D	0.000000	0.000181	0.000000	0.000000	0.000181	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000181	0.000181	0.000000	0.000000	0.000000	0.000181	0.000000	0.000000	0.000000
E	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000181	0.000000	0.000000

5 rows × 3434 columns

# remove '-' first line
max_series=freq_df.iloc[1:,:]

freq_max = pd.concat([max_series.idxmax(),max_series.max()],axis=1)

freq_max.columns = ['aa','max_value']

freq_max = freq_max.sort_values('max_value',ascending=False).reset_index(names='position')

out = freq_max[freq_max.max_value>0.1]

# out.to_csv('out/align_freq_max_aa.csv',index=False)

out

	position	aa	max_value
0	1549	N	0.815390
1	2618	D	0.809429
2	1724	D	0.800759
3	1525	D	0.791004
4	1730	G	0.775470
...	...	...	...
214	193	E	0.101879
215	640	G	0.101879
216	922	L	0.101337
217	603	R	0.101156
218	2581	G	0.100614

219 rows × 3 columns

Locate active motif & save

1549 N is after HRD motif
2618 D is around D[IV]WS motif
1724 D is DFG motif
1525 D is HRD motif

kd['D1']=(df[1525]=='D').astype(int) # HRD

kd['D2'] = (df[1724]=='D').astype(int) #DFG

kd['D3']=(df[2618]=='D').astype(int)

kd['N1'] = (df[1549]=='N').astype(int)

active_col = ['D1','D2']

kd['active_D1_D2'] = (kd[active_col].sum(1)==len(active_col)).astype(int)

active_all = kd[kd.active_D1_D2==1]

active_all.shape

(4209, 32)

# active_all.to_excel('out/uniprot_kd_active_D1_D2.xlsx',index=False)

# kd.to_excel('out/uniprot_kd_motif_labeled.xlsx',index=False)

Take a look of their identity:

kd[kd.active_D1_D2==1].kd_note.value_counts()

kd_note
Protein kinase               4005
PI3K/PI4K catalytic            71
Protein kinase 2               61
Protein kinase 1               37
Histidine kinase               34
Alpha-type protein kinase       1
Name: count, dtype: int64

kd[kd.active_D1_D2==0].kd_note.value_counts()

kd_note
Histidine kinase                  591
Protein kinase                    525
PI3K/PI4K catalytic                97
Protein kinase 1                   36
Alpha-type protein kinase          21
Protein kinase; inactive           13
Protein kinase 2                   12
HWE histidine kinase domain        11
Guanylate kinase-like               5
Histidine kinase 2                  4
Histidine kinase 1                  4
Amino-acid kinase domain (AAK)      3
Kinase domain                       2
Histidine kinase; first part        1
Histidine kinase; second part       1
Protein kinase; truncated           1
Name: count, dtype: int64

Add KD info to human kinase info

kd = pd.read_excel('out/uniprot_kd_motif_labeled.xlsx')

human= kd[kd.Organism=='Homo sapiens (Human)']

kd_id_uniprot_map = human.groupby('Uniprot').agg({'kd_ID': lambda x: ','.join(x.unique()),
                                                  'active_D1_D2': lambda x: x,
                                                 })

# kd_id_uniprot_map.to_csv('raw/kd_ID_uniprot_map.csv')

Get the csv merge with kinase info. Manually match KD1 and KD2 to the kinase info by considering active_D1_D2.

The data is updated on github.