Alignment

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
import pandas as pd

Prepare fasta file for clustalo

kd=pd.read_excel('out/uniprot_kd.xlsx')
kd
kd_ID Uniprot Entry Name Protein names Gene Names Gene Names (primary) Organism kd_note kd_evidence kd_start ... Reactome ComplexPortal Subcellular location [CC] Gene Ontology (biological process) Tissue specificity Interacts with Subunit structure Function [CC] Activity regulation full_seq
0 A0A075F7E9_LERK1_ORYSI_KD1 A0A075F7E9 LERK1_ORYSI G-type lectin S-receptor-like serine/threonine... LECRK1 LECRK OsI_14840 LECRK1 Oryza sativa subsp. indica (Rice) Protein kinase ECO:0000255|PROSITE-ProRule:PRU00159 523 ... NaN NaN SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ... defense response [GO:0006952]; response to oth... TISSUE SPECIFICITY: Expressed in plumules, rad... NaN SUBUNIT: Interacts (via kinase domain) with AD... FUNCTION: Involved in innate immunity. Require... NaN MVALLLFPMLLQLLSPTCAQTQKNITLGSTLAPQGPASSWLSPSGD...
1 A0A078BQP2_GCY25_CAEEL_KD1 A0A078BQP2 GCY25_CAEEL Receptor-type guanylate cyclase gcy-25 (EC 4.6... gcy-25 Y105C5B.2 gcy-25 Caenorhabditis elegans Protein kinase ECO:0000255|PROSITE-ProRule:PRU00159 464 ... R-CEL-2514859; NaN SUBCELLULAR LOCATION: Cell membrane {ECO:00003... cGMP biosynthetic process [GO:0006182]; intrac... TISSUE SPECIFICITY: Expressed in AQR, PQR and ... NaN NaN FUNCTION: Guanylate cyclase involved in the pr... NaN MLLLLLLLKISTFVDSFQIGHLEFENSNETRILEICMKNAGSWRDH...
2 A0A078CGE6_M3KE1_BRANA_KD1 A0A078CGE6 M3KE1_BRANA MAP3K epsilon protein kinase 1 (BnM3KE1) (EC 2... M3KE1 BnaA03g30290D GSBRNA2T00111755001 M3KE1 Brassica napus (Rape) Protein kinase ECO:0000255|PROSITE-ProRule:PRU00159 20 ... NaN NaN SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,... cell division [GO:0051301]; protein autophosph... TISSUE SPECIFICITY: Expressed in both the spor... NaN NaN FUNCTION: Serine/threonine-protein kinase invo... NaN MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...
3 A0A0G2K344_PK3CA_RAT_KD1 A0A0G2K344 PK3CA_RAT Phosphatidylinositol 4,5-bisphosphate 3-kinase... Pik3ca Pik3ca Rattus norvegicus (Rat) PI3K/PI4K catalytic ECO:0000255|PROSITE-ProRule:PRU00269 765 ... R-RNO-109704;R-RNO-112399;R-RNO-114604;R-RNO-1... NaN NaN actin cytoskeleton organization [GO:0030036]; ... TISSUE SPECIFICITY: Detected in the hypothalam... NaN SUBUNIT: Heterodimer of a catalytic subunit PI... FUNCTION: Phosphoinositide-3-kinase (PI3K) pho... NaN MPPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLVTIK...
4 A0A0H2ZM62_HK06_STRP2_KD1 A0A0H2ZM62 HK06_STRP2 Sensor histidine protein kinase HK06 (EC 2.7.1... hk06 SPD_2019 hk06 Streptococcus pneumoniae serotype 2 (strain D3... Histidine kinase ECO:0000255|PROSITE-ProRule:PRU00107 239 ... NaN NaN SUBCELLULAR LOCATION: Cell membrane {ECO:00003... NaN NaN NaN NaN FUNCTION: Member of the two-component regulato... NaN MIKNPKLLTKSFLRSFAILGGVGLVIHIAIYLTFPFYYIQLEGEKF...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5531 W0LYS5_CAMKI_MACNP_KD1 W0LYS5 CAMKI_MACNP Calcium/calmodulin-dependent protein kinase ty... CaMKI CaMKI Macrobrachium nipponense (Oriental river shrim... Protein kinase ECO:0000255|PROSITE-ProRule:PRU00159 31 ... NaN NaN NaN molting cycle process [GO:0022404] TISSUE SPECIFICITY: Highly expressed in hepato... NaN NaN FUNCTION: Calcium/calmodulin-dependent protein... ACTIVITY REGULATION: Activated by Ca(2+)/calmo... MPLFGSKKETAKKSSKKDKDEGKMPAVEDKYILKDLLGTGAFSQVR...
5532 W0T9X4_ATG1_KLUMD_KD1 W0T9X4 ATG1_KLUMD Serine/threonine-protein kinase ATG1 (EC 2.7.1... ATG1 KLMA_30321 ATG1 Kluyveromyces marxianus (strain DMKU3-1042 / B... Protein kinase ECO:0000255|PROSITE-ProRule:PRU00159 21 ... NaN NaN SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250|U... autophagosome assembly [GO:0000045]; autophagy... NaN W0TA43 SUBUNIT: Homodimer (By similarity). Dimerizati... FUNCTION: Serine/threonine protein kinase invo... NaN MSSESHGKAVAKAIRLPTENYTVEKEIGKGSFAIVYKGVSLRDGRN...
5533 W7JX98_KGP_PLAFO_KD1 W7JX98 KGP_PLAFO cGMP-dependent protein kinase (EC 2.7.11.12) PKG PFNF54_05395 PKG Plasmodium falciparum (isolate NF54) Protein kinase ECO:0000255|PROSITE-ProRule:PRU00159 541 ... NaN NaN SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250|U... protein phosphorylation [GO:0006468] NaN NaN SUBUNIT: Monomer. {ECO:0000250|UniProtKB:Q8I719}. FUNCTION: Serine/threonine protein kinase whic... ACTIVITY REGULATION: Activated by cGMP. Not ac... MEEDDNLKKGNERNKKKAIFSNDDFTGEDSLMEDHLELREKLSEDI...
5534 X5M5N0_WNK_CAEEL_KD1 X5M5N0 WNK_CAEEL Serine/threonine-protein kinase WNK (EC 2.7.11... wnk-1 C46C2.1 wnk-1 Caenorhabditis elegans Protein kinase ECO:0000255|PROSITE-ProRule:PRU00159 334 ... NaN NaN SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250|U... cell volume homeostasis [GO:0006884]; cellular... TISSUE SPECIFICITY: Expressed in pharynx, nerv... G5EEN4 SUBUNIT: Interacts with gck-3 (via C-terminus)... FUNCTION: Serine/threonine-protein kinase comp... ACTIVITY REGULATION: Activated in response to ... MPDSITNGGRPPAPPSSVSSTTASTTGNFGTRRRLVNRIKKVDELH...
5535 X5M8U1_GCY17_CAEEL_KD1 X5M8U1 GCY17_CAEEL Receptor-type guanylate cyclase gcy-17 (EC 4.6... gcy-17 W03F11.2 gcy-17 Caenorhabditis elegans Protein kinase ECO:0000255|PROSITE-ProRule:PRU00159 535 ... R-CEL-2514859; NaN SUBCELLULAR LOCATION: Cell membrane {ECO:00003... cGMP biosynthetic process [GO:0006182]; intrac... TISSUE SPECIFICITY: Expressed in PHA sensory n... NaN NaN FUNCTION: Guanylate cyclase involved in the pr... NaN MLFLRLFIFTPFLILANCQARRTIKVGLLFVQNVSSLQVGIGYRTS...

5536 rows × 27 columns

def get_fasta(df,seq_col='kd_seq',id_col='kd_ID',path='out.fasta'):
    "Generate fasta file from sequences."
    records = [
        SeqRecord(Seq(row[seq_col]), id=row[id_col], description="")
        for _, row in df.iterrows()
    ]
    SeqIO.write(records, path, "fasta")
    print(len(records))
get_fasta(kd,path='raw/kinase_domains.fasta')
5536

For human only:

human = kd[kd.Organism=='Homo sapiens (Human)']
get_fasta(human,path="raw/human_kinase_domains.fasta")
539

for PI3K catalytic domain

pi3k = kd[kd.kd_note=='PI3K/PI4K catalytic']
get_fasta(pi3k,path="raw/pi3k_kinase_domains.fasta")
168

Run clustalo

sudo apt-get update
sudo apt-get install clustalo
clustalo -i kinase_domains.fasta -o kinase_domains.aln --force --outfmt=clu
# import subprocess
# subprocess.run([
#     "clustalo", "-i", "kinase_domains.fasta", 
#     "-o", "kinase_domains.aln", "--force", "--outfmt=clu"
# ])

Analyze alignment

alignment = AlignIO.read("raw/kinase_domains.aln", "clustal")

Turns in to dataframe:

alignment_array = [list(str(record.seq)) for record in alignment]
df = pd.DataFrame(alignment_array)
df.columns = df.columns+1
df.index=kd.kd_ID
# df.to_parquet('output/uniprot_kd_align.parquet')
df.head()
1 2 3 4 5 6 7 8 9 10 ... 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434
kd_ID
A0A075F7E9_LERK1_ORYSI_KD1 - - - - - - - - - - ... - - - - - - - - - -
A0A078BQP2_GCY25_CAEEL_KD1 - - - - - - - - - - ... - - - - - - - - - -
A0A078CGE6_M3KE1_BRANA_KD1 - - - - - - - - - - ... - - - - - - - - - -
A0A0G2K344_PK3CA_RAT_KD1 - - - - - - - - - - ... - - - - - - - - - -
A0A0H2ZM62_HK06_STRP2_KD1 - - - - - - - - - - ... - - - - - - - - - -

5 rows × 3434 columns

# Compute residue frequency at each position
counts_df = df.apply(lambda col: col.value_counts(), axis=0).fillna(0)
freq_df = counts_df.div(counts_df.sum(axis=0), axis=1)
freq_df.head()
1 2 3 4 5 6 7 8 9 10 ... 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434
- 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 ... 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819 0.999819
A 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
C 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
D 0.000000 0.000181 0.000000 0.000000 0.000181 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000181 0.000181 0.000000 0.000000 0.000000 0.000181 0.000000 0.000000 0.000000
E 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000181 0.000000 0.000000

5 rows × 3434 columns

# remove '-' first line
max_series=freq_df.iloc[1:,:]
freq_max = pd.concat([max_series.idxmax(),max_series.max()],axis=1)
freq_max.columns = ['aa','max_value']
freq_max = freq_max.sort_values('max_value',ascending=False).reset_index(names='position')
out = freq_max[freq_max.max_value>0.1]
# out.to_csv('out/align_freq_max_aa.csv',index=False)
out
position aa max_value
0 1549 N 0.815390
1 2618 D 0.809429
2 1724 D 0.800759
3 1525 D 0.791004
4 1730 G 0.775470
... ... ... ...
214 193 E 0.101879
215 640 G 0.101879
216 922 L 0.101337
217 603 R 0.101156
218 2581 G 0.100614

219 rows × 3 columns

Locate active motif & save

  • 1549 N is after HRD motif
  • 2618 D is around D[IV]WS motif
  • 1724 D is DFG motif
  • 1525 D is HRD motif
kd['D1']=(df[1525]=='D').astype(int) # HRD
kd['D2'] = (df[1724]=='D').astype(int) #DFG
kd['D3']=(df[2618]=='D').astype(int)
kd['N1'] = (df[1549]=='N').astype(int)
active_col = ['D1','D2']
kd['active_D1_D2'] = (kd[active_col].sum(1)==len(active_col)).astype(int)
active_all = kd[kd.active_D1_D2==1]
active_all.shape
(4209, 32)
# active_all.to_excel('out/uniprot_kd_active_D1_D2.xlsx',index=False)
# kd.to_excel('out/uniprot_kd_motif_labeled.xlsx',index=False)

Take a look of their identity:

kd[kd.active_D1_D2==1].kd_note.value_counts()
kd_note
Protein kinase               4005
PI3K/PI4K catalytic            71
Protein kinase 2               61
Protein kinase 1               37
Histidine kinase               34
Alpha-type protein kinase       1
Name: count, dtype: int64
kd[kd.active_D1_D2==0].kd_note.value_counts()
kd_note
Histidine kinase                  591
Protein kinase                    525
PI3K/PI4K catalytic                97
Protein kinase 1                   36
Alpha-type protein kinase          21
Protein kinase; inactive           13
Protein kinase 2                   12
HWE histidine kinase domain        11
Guanylate kinase-like               5
Histidine kinase 2                  4
Histidine kinase 1                  4
Amino-acid kinase domain (AAK)      3
Kinase domain                       2
Histidine kinase; first part        1
Histidine kinase; second part       1
Protein kinase; truncated           1
Name: count, dtype: int64

Add KD info to human kinase info

kd = pd.read_excel('out/uniprot_kd_motif_labeled.xlsx')
human= kd[kd.Organism=='Homo sapiens (Human)']
kd_id_uniprot_map = human.groupby('Uniprot').agg({'kd_ID': lambda x: ','.join(x.unique()),
                                                  'active_D1_D2': lambda x: x,
                                                 })
# kd_id_uniprot_map.to_csv('raw/kd_ID_uniprot_map.csv')

Get the csv merge with kinase info. Manually match KD1 and KD2 to the kinase info by considering active_D1_D2.

The data is updated on github.