from katlas.core import *
import pandas as pd,seaborn as sns
Compare CDDM and PSPA in Tyr kinase
Load CDDM
# Load the CDDM table from katlas.
cddm = Data.get_cddm()
# Drop the last six columns (presumably not shared with PSPA -- TODO confirm).
cddm = cddm.iloc[:, :-6]
Load PSPA
# Load the normalized tyrosine-kinase PSPA table and drop its last six columns.
pspa = Data.get_pspa_tyr_norm().iloc[:, :-6]
Match PSPA index to CDDM
# Kinase annotation table for the PSPA tyrosine set.
pspa_info = pd.read_csv('raw/lew_tyr_info.csv')

# Display the PSPA matrix.
pspa
-5P | -5G | -5A | -5C | -5S | -5T | -5V | -5I | -5L | -5M | ... | 5H | 5K | 5R | 5Q | 5N | 5D | 5E | 5s | 5t | 5y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
kinase | |||||||||||||||||||||
ABL1 | 0.0668 | 0.0689 | 0.0646 | 0.0520 | 0.0564 | 0.0539 | 0.0485 | 0.0448 | 0.0520 | 0.0536 | ... | 0.0613 | 0.0652 | 0.0756 | 0.0526 | 0.0512 | 0.0362 | 0.0339 | 0.0254 | 0.0254 | 0.0337 |
TNK2 | 0.0679 | 0.0818 | 0.0627 | 0.0617 | 0.0529 | 0.0528 | 0.0419 | 0.0463 | 0.0437 | 0.0453 | ... | 0.0499 | 0.0385 | 0.0302 | 0.0531 | 0.0465 | 0.0630 | 0.0572 | 0.0364 | 0.0364 | 0.0572 |
ALK | 0.0675 | 0.0640 | 0.0590 | 0.0511 | 0.0476 | 0.0422 | 0.0455 | 0.0514 | 0.0546 | 0.0543 | ... | 0.0448 | 0.0367 | 0.0489 | 0.0334 | 0.0387 | 0.0245 | 0.0226 | 0.0181 | 0.0181 | 0.0172 |
ABL2 | 0.0687 | 0.0715 | 0.0611 | 0.0448 | 0.0537 | 0.0513 | 0.0467 | 0.0398 | 0.0462 | 0.0505 | ... | 0.0566 | 0.0640 | 0.0779 | 0.0538 | 0.0565 | 0.0378 | 0.0381 | 0.0252 | 0.0252 | 0.0289 |
AXL | 0.0656 | 0.0753 | 0.0535 | 0.0525 | 0.0468 | 0.0467 | 0.0459 | 0.0538 | 0.0507 | 0.0542 | ... | 0.0441 | 0.0506 | 0.0355 | 0.0635 | 0.0696 | 0.0592 | 0.0559 | 0.0413 | 0.0413 | 0.0455 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
KDR | 0.0634 | 0.0672 | 0.0556 | 0.0517 | 0.0541 | 0.0526 | 0.0427 | 0.0420 | 0.0428 | 0.0476 | ... | 0.0543 | 0.0653 | 0.0771 | 0.0509 | 0.0582 | 0.0414 | 0.0387 | 0.0335 | 0.0335 | 0.0406 |
FLT4 | 0.0457 | 0.0531 | 0.0488 | 0.0553 | 0.0512 | 0.0471 | 0.0432 | 0.0499 | 0.0474 | 0.0530 | ... | 0.0624 | 0.0564 | 0.0559 | 0.0537 | 0.0610 | 0.0620 | 0.0528 | 0.0600 | 0.0600 | 0.0464 |
WEE1_TYR | 0.0531 | 0.0640 | 0.0559 | 0.0560 | 0.0433 | 0.0435 | 0.0568 | 0.0571 | 0.0637 | 0.0562 | ... | 0.0585 | 0.1058 | 0.1658 | 0.0447 | 0.0495 | 0.0312 | 0.0365 | 0.0453 | 0.0453 | 0.0490 |
YES1 | 0.0677 | 0.0571 | 0.0537 | 0.0530 | 0.0527 | 0.0505 | 0.0435 | 0.0375 | 0.0400 | 0.0463 | ... | 0.0593 | 0.0662 | 0.0840 | 0.0559 | 0.0604 | 0.0422 | 0.0482 | 0.0374 | 0.0374 | 0.0411 |
ZAP70 | 0.0602 | 0.0880 | 0.0623 | 0.0496 | 0.0471 | 0.0514 | 0.0465 | 0.0380 | 0.0307 | 0.0526 | ... | 0.0484 | 0.0477 | 0.0290 | 0.0520 | 0.0537 | 0.0709 | 0.0710 | 0.0862 | 0.0862 | 0.0605 |
93 rows × 230 columns
Get overlapping kinases
# non-overlapped: PSPA kinases that have no row in CDDM
pspa.index[~pspa.index.isin(cddm.index)]
Index(['BMPR2_TYR', 'DDR1', 'ERBB2', 'LIMK1_TYR', 'LIMK2_TYR', 'MAP2K4_TYR',
'MAP2K6_TYR', 'MAP2K7_TYR', 'PKMYT1_TYR', 'NEK10_TYR', 'PDHK1_TYR',
'PDHK3_TYR', 'PDHK4_TYR', 'PINK1_TYR', 'TESK1_TYR', 'TNNI3K_TYR',
'WEE1_TYR'],
dtype='object', name='kinase')
# Kinases present in both tables, kept in PSPA row order.
overlap_kinase = pspa.index[pspa.index.isin(cddm.index)]
overlap_kinase
Index(['ABL1', 'TNK2', 'ALK', 'ABL2', 'AXL', 'BLK', 'PTK6', 'BTK', 'CSF1R',
'CSK', 'MATK', 'DDR2', 'EGFR', 'EPHA1', 'EPHA2', 'EPHA3', 'EPHA4',
'EPHA5', 'EPHA6', 'EPHA7', 'EPHA8', 'EPHB1', 'EPHB2', 'EPHB3', 'EPHB4',
'BMX', 'PTK2', 'FER', 'FES', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FGR',
'FLT3', 'FRK', 'FYN', 'HCK', 'ERBB4', 'IGF1R', 'INSR', 'INSRR', 'ITK',
'JAK1', 'JAK2', 'JAK3', 'KIT', 'LCK', 'LTK', 'LYN', 'MERTK', 'MET',
'MST1R', 'MUSK', 'PDGFRA', 'PDGFRB', 'PTK2B', 'RET', 'ROS1', 'SRC',
'SRMS', 'SYK', 'TEC', 'TEK', 'TNK1', 'NTRK1', 'NTRK2', 'NTRK3', 'TXK',
'TYK2', 'TYRO3', 'FLT1', 'KDR', 'FLT4', 'YES1', 'ZAP70'],
dtype='object', name='kinase')
# Restrict both tables to the shared kinases and to the PSPA column set,
# so rows and columns line up for row-wise correlation.
overlap_column = pspa.columns
pspa = pspa.loc[overlap_kinase, overlap_column]
cddm = cddm.loc[overlap_kinase, overlap_column]
Pearson
from matplotlib import pyplot as plt
# Row-wise correlation between PSPA and CDDM for each kinase, highest first.
d = pspa.corrwith(cddm, axis=1).sort_values(ascending=False)
d
kinase
BLK 0.422068
TNK2 0.418332
TXK 0.366034
LCK 0.364885
ABL2 0.355762
...
MATK 0.028732
FLT4 0.020456
PTK6 -0.001976
FES -0.006169
PDGFRA -0.040095
Length: 76, dtype: float64
# Figure styling.
sns.set(rc={"figure.dpi": 200, 'savefig.dpi': 200})
sns.set_context('notebook')
sns.set_style("ticks")

# Bar plot of the per-kinase correlation, sorted from highest to lowest.
pspa.corrwith(cddm, axis=1).sort_values(ascending=False).plot.bar(figsize=(15, 3))
plt.xlabel('')
plt.ylabel('Pearson');
Compare PSSM heatmaps
from matplotlib import pyplot as plt
from katlas.plot import *
import os, seaborn as sns
from tqdm import tqdm
from PIL import Image
def plot_kinase(df, kinase, title, fname=None, aa_order_paper=None):
    """Plot the heatmap of one kinase's matrix, then save or show it.

    Parameters
    ----------
    df : table passed to ``get_one_kinase`` (e.g. ``cddm`` or ``pspa``).
    kinase : row label of the kinase to plot.
    title : title passed to ``plot_heatmap``.
    fname : output path; when falsy the figure is shown instead of saved.
    aa_order_paper : row labels (and their order) selected from the transposed
        matrix; defaults to the characters of ``'PGACSTVILMFYWHKRQNDEsty'``.
    """
    # Build the default inside the call so no list is shared between calls.
    if aa_order_paper is None:
        aa_order_paper = list('PGACSTVILMFYWHKRQNDEsty')

    # get PSSM matrix, transposed and re-ordered by amino acid
    m = get_one_kinase(df, kinase, drop_s=False).T.loc[aa_order_paper]

    # plot heatmap
    plot_heatmap(m, title=title)

    if fname:
        plt.savefig(fname, bbox_inches='tight', pad_inches=0.05)
    else:
        plt.show()
    # Close the figure so repeated calls in a loop do not accumulate figures.
    plt.close()
def plot_cor(k, title, fname=None):
    """Plot CDDM versus PSPA values for kinase ``k``, then save or show it.

    Reads the module-level ``cddm`` and ``pspa`` tables, which must both
    contain row ``k``.

    Parameters
    ----------
    k : row label of the kinase.
    title : figure title.
    fname : output path; when falsy the figure is shown instead of saved.
    """
    # plot
    plot_corr(cddm.loc[k], pspa.loc[k], 'CDDM', 'PSPA')
    plt.title(title)

    if fname:
        plt.savefig(fname, bbox_inches='tight', pad_inches=0.2)
    else:
        plt.show()
    # Close the figure so repeated calls in a loop do not accumulate figures.
    plt.close()
# Kinases ordered by PSPA/CDDM correlation, from highest to lowest.
kinase_order = pspa.corrwith(cddm, axis=1).sort_values(ascending=False).index
kinase_order
Index(['BLK', 'TNK2', 'TXK', 'LCK', 'ABL2', 'TEC', 'ABL1', 'FYN', 'EPHA6',
'SRMS', 'EPHB4', 'DDR2', 'EPHA5', 'EPHB2', 'YES1', 'EPHB3', 'HCK',
'SYK', 'EPHA7', 'FGFR4', 'EPHA2', 'EPHA8', 'EPHA4', 'PTK2B', 'ITK',
'LYN', 'FRK', 'PTK2', 'CSF1R', 'JAK3', 'JAK1', 'SRC', 'LTK', 'EPHB1',
'AXL', 'MST1R', 'MERTK', 'EPHA1', 'FLT1', 'EGFR', 'FER', 'KIT', 'BMX',
'INSRR', 'FGFR3', 'JAK2', 'EPHA3', 'MET', 'BTK', 'FLT3', 'FGR', 'KDR',
'ERBB4', 'ZAP70', 'RET', 'TNK1', 'FGFR2', 'TYK2', 'FGFR1', 'ALK',
'PDGFRB', 'TEK', 'MUSK', 'TYRO3', 'INSR', 'NTRK3', 'ROS1', 'NTRK1',
'NTRK2', 'IGF1R', 'CSK', 'MATK', 'FLT4', 'PTK6', 'FES', 'PDGFRA'],
dtype='object', name='kinase')
# get count of KS pairs in CDDM
df = Data.get_ks_dataset()

# Convert substrate names to uppercase
df['SUB'] = df['substrate'].str.upper()

# Remove duplicates based on kinase and substrate
df_unique = df.drop_duplicates(subset=['kinase_paper', 'SUB'])

# Count unique substrates for each kinase
cnt_unique = df_unique.groupby('kinase_paper').size()

# Count number of substrates (with duplicates) for each kinase
cnt_general = df.kinase_paper.value_counts()
Comment out the `break` below to run all kinases
for k in tqdm(kinase_order):
    # CDDM heatmap, titled with the number of unique substrates for this kinase
    plot_kinase(cddm, k, title=f'{k} from KS datasets (n={cnt_unique[k]})')
    # PSPA heatmap
    plot_kinase(pspa, k, title=f'{k} from PSPA')
    # CDDM vs PSPA scatter for this kinase
    plot_cor(k, title=k)
    # Preview only the first kinase; comment out to run all of them.
    break
0%| | 0/76 [00:00<?, ?it/s]
0%| | 0/76 [00:00<?, ?it/s]
To save, uncomment below
# for k in tqdm(kinase_order):
# # print('CDDM')
# plot_kinase(cddm,k,title= f'{k} from KS datasets (n={cnt_unique[k]})', fname=f'corr_tyr/CDDM/{k}.png')
# # print('PSPA')
# plot_kinase(pspa,k, title = f'{k} from PSPA', fname=f'corr_tyr/PSPA/{k}.png')
# plot_cor(k,title = k, fname=f'corr_tyr/pear/{k}.png')
# # break
Combine images
def combine_images_custom_layout(image_paths, output_path):
    """Combine three images into one: first on top (centered), other two below.

    Parameters
    ----------
    image_paths : sequence of image file paths. Every path is opened, but only
        the first three images are placed; fewer than three raises IndexError.
    output_path : path the combined image is written to.
    """
    images = []
    for image_path in image_paths:
        # convert() returns a new, fully loaded image, so the source file can
        # be closed right away instead of staying open.
        with Image.open(image_path) as src:
            images.append(src.convert('RGBA'))

    top, bottom_left, bottom_right = images[0], images[1], images[2]

    # Calculate total width and height for the new image
    total_width = max(top.width, bottom_left.width + bottom_right.width)
    total_height = top.height + max(bottom_left.height, bottom_right.height)

    # Create a new image with calculated dimensions
    combined_image = Image.new('RGBA', (total_width, total_height))

    # Paste the first image at the top-center (each image is its own alpha mask)
    x_offset = (total_width - top.width) // 2
    combined_image.paste(top, (x_offset, 0), top)

    # Paste the second image at the bottom-left
    combined_image.paste(bottom_left, (0, top.height), bottom_left)

    # Paste the third image at the bottom-right
    combined_image.paste(bottom_right, (bottom_left.width, top.height), bottom_right)

    # Save the result
    combined_image.save(output_path)
Uncomment below to save combined figure for pdf
# folders = ["corr_tyr/pear",'corr_tyr/CDDM','corr_tyr/PSPA']
# for k in tqdm(kinase_order,total=len(kinase_order)):
# filename = f"{k}.png"
# image_paths = [os.path.join(folder, filename) for folder in folders]
# output_path = f"corr_tyr/combine/{k}.png"
# combine_images_custom_layout(image_paths, output_path)
# # break
Convert images to pdf
Correlation of amino acids DE and phospho-S/T/Y
import pandas as pd
from katlas.core import *
from katlas.plot import *
import seaborn as sns
from matplotlib import pyplot as plt
# Reload both tables for the D/E vs phospho-residue analysis.
cddm = Data.get_cddm()

pspa = Data.get_pspa_tyr_norm().iloc[:, :-6]

# remove dual: keep only kinases whose name has no '_' suffix (e.g. drop 'WEE1_TYR')
pspa = pspa[pspa.index.str.split('_').str.len() == 1]

# Align both tables on the shared kinases and on the PSPA column set.
overlap_kinase = cddm.index.intersection(pspa.index)
overlap_columns = pspa.columns

pspa = pspa.loc[overlap_kinase, overlap_columns]

cddm = cddm.loc[overlap_kinase, overlap_columns]
# Columns whose name contains 'D' or 'E'.
D = cddm.columns[cddm.columns.str.contains('D')].tolist()

E = cddm.columns[cddm.columns.str.contains('E')].tolist()

DE = D + E

# Per-kinase maximum over the D/E columns.
cddm[DE].max(axis=1)

# Columns whose name contains lowercase 's', 't' or 'y'.
s = cddm.columns[cddm.columns.str.contains('s')].tolist()
t = cddm.columns[cddm.columns.str.contains('t')].tolist()
y = cddm.columns[cddm.columns.str.contains('y')].tolist()

sty = s + t + y
# Figure styling.
sns.set(rc={"figure.dpi": 200, 'savefig.dpi': 200})
sns.set_context('notebook')
sns.set_style("ticks")

# One scatter per entry: (CDDM columns, x label, PSPA columns, y label).
# x is the per-kinase median over the CDDM columns; y is the per-kinase
# maximum over the PSPA columns. Labels now name the columns actually plotted;
# several of the original labels said 'DE' / 'sty' for other column sets.
# 'PSPA s/t max' is kept as it was in the original.
corr_panels = [
    (DE, 'CDDM DE median', sty, 'PSPA sty max'),
    (E, 'CDDM E median', sty, 'PSPA sty max'),
    (E, 'CDDM E median', y, 'PSPA y max'),
    (E, 'CDDM E median', s, 'PSPA s max'),
    (DE, 'CDDM DE median', t, 'PSPA s/t max'),
    (E, 'CDDM E median', y, 'PSPA y max'),
    (D, 'CDDM D median', sty, 'PSPA sty max'),
]

for cddm_cols, x_label, pspa_cols, y_label in corr_panels:
    plt.figure(figsize=(5, 4))
    plot_corr(cddm[cddm_cols].median(axis=1), pspa[pspa_cols].max(axis=1), x_label, y_label)
To save the data, uncomment below
# data_list = [cddm[DE].median(1),
# cddm[D].median(1),
# cddm[E].median(1),
# pspa[t].max(1),
# pspa[y].max(1)]
# d = pd.concat(data_list,axis=1)
# d.columns=['CDDM_DE_median','CDDM_D_median','CDDM_E_median',
# 'PSPA_t_max','PSPA_y_max']
# d.to_csv('source/Fig4CD.csv')