Plot

Functions to plot motif logo, heatmap, scatter plot, and others.

Setup

Utils

set_sns

 set_sns ()

Set seaborn resolution for notebook display

source

get_color_dict

 get_color_dict (categories, palette:str='tab20')

Assign colors to a list of names (duplicates allowed); returns a dictionary mapping each unique name to its color

	Type	Default	Details
categories			list of names to assign color
palette	str	tab20	choose from sns.color_palette

get_color_dict(['a','a','b'])

{'a': (0.6823529411764706, 0.7803921568627451, 0.9098039215686274),
 'b': (1.0, 0.4980392156862745, 0.054901960784313725)}

Heatmap

# get kinase-substrate dataset
ks = Data.get_ks_dataset()

ks_k = ks[ks.kinase_uniprot=='P00519']

pssm_df = get_prob(ks_k,'site_seq')
pssm_df.head()

Position	-20	-19	-18	-17	-16	-15	-14	-13	-12	-11	...	11	12	13	14	15	16	17	18	19	20
aa
P	0.050061	0.048691	0.062349	0.055489	0.046988	0.054753	0.064787	0.055090	0.056683	0.048272	...	0.052728	0.051140	0.069436	0.063164	0.057716	0.056639	0.051072	0.050697	0.052163	0.060703
G	0.080586	0.080341	0.069007	0.067551	0.082530	0.070397	0.093581	0.073054	0.077566	0.072706	...	0.099939	0.070856	0.071916	0.075672	0.071518	0.064821	0.080076	0.088720	0.062341	0.090735
A	0.080586	0.080341	0.062954	0.054282	0.075301	0.071600	0.070186	0.070060	0.065632	0.070322	...	0.064378	0.077634	0.069436	0.072545	0.063363	0.079924	0.088272	0.087452	0.057888	0.070927
C	0.017094	0.012781	0.013317	0.019903	0.012048	0.017449	0.007798	0.014371	0.013126	0.012515	...	0.007357	0.017868	0.014879	0.012508	0.011920	0.018880	0.019546	0.014575	0.019084	0.014058
S	0.047619	0.035910	0.046610	0.030157	0.037349	0.042720	0.041992	0.041916	0.034010	0.039333	...	0.024525	0.036352	0.047117	0.040025	0.042033	0.040277	0.039092	0.051965	0.041349	0.039617

5 rows × 41 columns

@delegates(sns.heatmap)
def plot_heatmap_simple(matrix, # a matrix of values
                 title: str='heatmap', # title of the heatmap
                 figsize: tuple=(6,7), # (width, height); only used when no `ax` is passed
                 cmap: str='binary', # color map, default is black & white
                 **kwargs, # arguments for sns.heatmap()
                 ):
    
    """Plot heatmap based on a matrix of values.

    Cells are drawn square and without annotations by default; both can be
    overridden through `square=` / `annot=` in `kwargs`. When an existing
    axes is supplied via `ax=`, the heatmap is drawn on it and no new figure
    is created. Returns None.
    """
    
    # `@delegates` exposes every sns.heatmap keyword to callers, so these must
    # be overridable defaults: passing them explicitly next to **kwargs would
    # raise "got multiple values for keyword argument".
    kwargs.setdefault('square', True)
    kwargs.setdefault('annot', False)

    # Only open a new figure when the caller did not provide an axes;
    # otherwise an empty figure would be created and left behind.
    if kwargs.get('ax') is None:
        plt.figure(figsize=figsize)

    # Style the axes that seaborn actually drew on rather than pyplot's
    # "current" axes, which may be a different one when `ax` is supplied.
    ax = sns.heatmap(matrix, cmap=cmap, **kwargs)
    ax.set_title(title)
    ax.set_ylabel('')
    ax.set_xlabel('')
    plt.setp(ax.get_yticklabels(), rotation=0)

plot_heatmap_simple(pssm_df,'kinase')

source

plot_heatmap

 plot_heatmap (heatmap_df, ax=None, position_label=True, figsize=(5, 6),
               include_zero=True)

Plots a heatmap with specific formatting.

plot_heatmap(pssm_df);
# plt.savefig('plot.svg',bbox_inches='tight')

Logo

From frequency data

source

change_center_name

 change_center_name (df)

Rename the center residues pS, pT, pY to S, T, Y for plotting.

source

scale_zero_position

 scale_zero_position (logo_df)

source

get_logo_df

 get_logo_df (prob_df, scale_zero=True)

source

plot_logo

 plot_logo (prob_df, title='Motif', scale_zero=True, ax=None, figsize=(6,
            1))

plot_logo(pssm_df,scale_zero=False)

Keeping scale_zero at its default (True) makes the amino acids at the flanking positions easier to see

plot_logo(pssm_df)

source

extract_motifs

 extract_motifs (df, cluster_col, seq_col='site_seq', count_thr=10,
                 valid_thr=None, plot=False)

Extract motifs from clusters in a dataframe

source

plot_logo_heatmap

 plot_logo_heatmap (pssm_df, title='Motif', figsize=(7, 8),
                    include_zero=False)

Plot logo and heatmap vertically

	Type	Default	Details
pssm_df			column is position, index is aa
title	str	Motif
figsize	tuple	(7, 8)
include_zero	bool	False

plot_logo_heatmap(pssm_df)

Plot PSPA data logo

source

plot_logo_raw

 plot_logo_raw (logo_df, ax=None, title='Motif', ytitle='Enrichment',
                figsize=(6, 2))

source

get_logo

 get_logo (df:pandas.core.frame.DataFrame, kinase:str)

Given stacked df (index as kinase, columns as substrates), get a specific kinase’s logo

	Type	Details
df	DataFrame	stacked Dataframe with kinase as index, substrates as columns
kinase	str	a specific kinase name in index

This function replicates the motif logos from Johnson et al., Nature: “An atlas of substrate specificities for the human serine/threonine kinome”. Given raw PSPA data, it outputs a motif logo.

# load raw PSPA data
df = pd.read_csv('https://github.com/sky1ove/katlas_raw/raw/refs/heads/main/nbs/raw/pspa_st_raw.csv').set_index('kinase')
df.head()

	-5P	-5G	-5A	-5C	-5S	-5T	-5V	-5I	-5L	-5M	...	4H	4K	4R	4Q	4N	4D	4E	4s	4t	4y
kinase
AAK1	7614134.38	2590563.43	3001315.49	4696631.43	4944311.77	8315837.72	10056545.00	16433061.43	10499735.53	9133577.86	...	6020662.73	8938081.41	9983402.01	6833481.55	6364453.29	4189045.89	4921595.57	2705053.53	2705053.53	2909279.71
ACVR2A	4991039.28	5783855.86	7015770.78	8367603.09	7072052.48	7601399.57	7188292.41	7513915.73	7159894.71	6266122.81	...	6039472.76	5556300.56	5178734.62	6490097.70	5862480.97	6742905.78	6750653.36	7414220.16	7414220.16	6209576.97
ACVR2B	26480329.10	25689687.16	28137300.90	45175909.30	32876722.90	33516959.03	27011194.06	21996255.94	23412987.54	25670581.40	...	27984195.21	22496915.32	24236904.72	29132857.30	26527389.14	36388726.15	34729319.54	37906081.09	37906081.09	31761418.56
AKT1	18399509.29	18104681.05	16831835.48	17247743.90	22647275.57	17801288.32	13037570.99	13271896.32	14156489.52	15409761.84	...	29511541.69	50942663.29	48152924.11	32693882.62	28896602.57	19701350.30	13887460.52	17483074.60	17483074.60	11696833.54
AKT2	5439237.54	5569477.23	5805462.70	6301076.01	5004932.12	4812022.80	3906822.27	3776845.45	4450344.85	4629319.80	...	6812201.58	11590683.50	9932525.89	6544476.93	6252360.75	3629091.99	3510048.19	5499662.30	5499662.30	4188620.88

5 rows × 207 columns

# plot logo of a kinase
get_logo(df, 'AAK1')

Rank

/opt/hostedtoolcache/Python/3.10.17/x64/lib/python3.10/site-packages/fastcore/docscrape.py:230: UserWarning: Unknown section See Also
  else: warn(msg)

source

plot_rank

 plot_rank (sorted_df:pandas.core.frame.DataFrame, x:str, y:str,
            n_hi:int=10, n_lo:int=10, figsize:tuple=(10, 8), data=None,
            hue=None, size=None, style=None, palette=None, hue_order=None,
            hue_norm=None, sizes=None, size_order=None, size_norm=None,
            markers=True, style_order=None, legend='auto', ax=None)

Plot rank from a sorted dataframe

	Type	Default	Details
sorted_df	DataFrame		a sorted dataframe
x	str		column name for x axis
y	str		column name for y axis
n_hi	int	10	if not None, show the head n names
n_lo	int	10	if not None, show the tail n names
figsize	tuple	(10, 8)	figure size
data	NoneType	None	Input data structure. Either a long-form collection of vectors that can be assigned to named variables or a wide-form dataset that will be internally reshaped.
hue	NoneType	None	Grouping variable that will produce points with different colors. Can be either categorical or numeric, although color mapping will behave differently in latter case.
size	NoneType	None	Grouping variable that will produce points with different sizes. Can be either categorical or numeric, although size mapping will behave differently in latter case.
style	NoneType	None	Grouping variable that will produce points with different markers. Can have a numeric dtype but will always be treated as categorical.
palette	NoneType	None	Method for choosing the colors to use when mapping the `hue` semantic. String values are passed to :func:`color_palette`. List or dict values imply categorical mapping, while a colormap object implies numeric mapping.
hue_order	NoneType	None	Specify the order of processing and plotting for categorical levels of the `hue` semantic.
hue_norm	NoneType	None	Either a pair of values that set the normalization range in data units or an object that will map from data units into a [0, 1] interval. Usage implies numeric mapping.
sizes	NoneType	None	An object that determines how sizes are chosen when `size` is used. List or dict arguments should provide a size for each unique data value, which forces a categorical interpretation. The argument may also be a min, max tuple.
size_order	NoneType	None	Specified order for appearance of the `size` variable levels, otherwise they are determined from the data. Not relevant when the `size` variable is numeric.
size_norm	NoneType	None	Normalization in data units for scaling plot objects when the `size` variable is numeric.
markers	bool	True	Object determining how to draw the markers for different levels of the `style` variable. Setting to `True` will use default markers, or you can pass a list of markers or a dictionary mapping levels of the `style` variable to markers. Setting to `False` will draw marker-less lines. Markers are specified as in matplotlib.
style_order	NoneType	None	Specified order for appearance of the `style` variable levels otherwise they are determined from the data. Not relevant when the `style` variable is numeric.
legend	str	auto	How to draw the legend. If “brief”, numeric `hue` and `size` variables will be represented with a sample of evenly spaced values. If “full”, every group will get an entry in the legend. If “auto”, choose between brief or full representation based on number of levels. If `False`, no legend data is added and no legend is drawn.
ax	NoneType	None	Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca` internally.
Returns	:class:`matplotlib.axes.Axes`		The matplotlib axes containing the plot.

# load data
# df = Data.get_pspa_raw().set_index('kinase')
df = pd.read_csv('https://github.com/sky1ove/katlas_raw/raw/refs/heads/main/nbs/raw/pspa_st_raw.csv').set_index('kinase')


# get sorted dataframe
sorted_df = df.max(1).reset_index(name='values').sort_values('values')
sorted_df.head()

	kinase	values
68	CK1G2	189898.392
294	VRK2	4191709.640
8	ALPHAK3	4573611.730
249	PRPK	8495330.790
38	CAMLCK	9413689.600

plot_rank(sorted_df,x='kinase',y='values')
plt.xlabel('kinase');

Distribution histogram

source

plot_hist

 plot_hist (df:pandas.core.frame.DataFrame, x:str, figsize:tuple=(6, 2),
            data=None, y=None, hue=None, weights=None, stat='count',
            bins='auto', binwidth=None, binrange=None, discrete=None,
            cumulative=False, common_bins=True, common_norm=True,
            multiple='layer', element='bars', fill=True, shrink=1,
            kde=False, kde_kws=None, line_kws=None, thresh=0,
            pthresh=None, pmax=None, cbar=False, cbar_ax=None,
            cbar_kws=None, palette=None, hue_order=None, hue_norm=None,
            color=None, log_scale=None, legend=True, ax=None)

	Type	Default	Details
df	DataFrame		a dataframe that contain values for plot
x	str		column name of values
figsize	tuple	(6, 2)
data	NoneType	None	Input data structure. Either a long-form collection of vectors that can be assigned to named variables or a wide-form dataset that will be internally reshaped.
y	NoneType	None
hue	NoneType	None	Semantic variable that is mapped to determine the color of plot elements.
weights	NoneType	None	Vector variables
stat	str	count	Aggregate statistic to compute in each bin. - `count`: show the number of observations in each bin - `frequency`: show the number of observations divided by the bin width - `probability` or `proportion`: normalize such that bar heights sum to 1 - `percent`: normalize such that bar heights sum to 100 - `density`: normalize such that the total area of the histogram equals 1
bins	str	auto	Generic bin parameter that can be the name of a reference rule, the number of bins, or the breaks of the bins. Passed to :func:`numpy.histogram_bin_edges`.
binwidth	NoneType	None	Width of each bin, overrides `bins` but can be used with `binrange`.
binrange	NoneType	None	Histogram computation parameters
discrete	NoneType	None	If True, default to `binwidth=1` and draw the bars so that they are centered on their corresponding data points. This avoids “gaps” that may otherwise appear when using discrete (integer) data.
cumulative	bool	False	If True, plot the cumulative counts as bins increase.
common_bins	bool	True	If True, use the same bins when semantic variables produce multiple plots. If using a reference rule to determine the bins, it will be computed with the full dataset.
common_norm	bool	True	If True and using a normalized statistic, the normalization will apply over the full dataset. Otherwise, normalize each histogram independently.
multiple	str	layer	Approach to resolving multiple elements when semantic mapping creates subsets. Only relevant with univariate data.
element	str	bars	Visual representation of the histogram statistic. Only relevant with univariate data.
fill	bool	True	If True, fill in the space under the histogram. Only relevant with univariate data.
shrink	int	1	Histogram appearance parameters
kde	bool	False	If True, compute a kernel density estimate to smooth the distribution and show on the plot as (one or more) line(s). Only relevant with univariate data.
kde_kws	NoneType	None	Parameters that control the KDE computation, as in :func:`kdeplot`.
line_kws	NoneType	None	Histogram smoothing with a kernel density estimate
thresh	int	0	Cells with a statistic less than or equal to this value will be transparent. Only relevant with bivariate data.
pthresh	NoneType	None	Like `thresh`, but a value in [0, 1] such that cells with aggregate counts (or other statistics, when used) up to this proportion of the total will be transparent.
pmax	NoneType	None	A value in [0, 1] that sets that saturation point for the colormap at a value such that cells below constitute this proportion of the total count (or other statistic, when used).
cbar	bool	False	If True, add a colorbar to annotate the color mapping in a bivariate plot. Note: Does not currently support plots with a `hue` variable well.
cbar_ax	NoneType	None	Pre-existing axes for the colorbar.
cbar_kws	NoneType	None	Bivariate histogram parameters
palette	NoneType	None	Method for choosing the colors to use when mapping the `hue` semantic. String values are passed to :func:`color_palette`. List or dict values imply categorical mapping, while a colormap object implies numeric mapping.
hue_order	NoneType	None	Specify the order of processing and plotting for categorical levels of the `hue` semantic.
hue_norm	NoneType	None	Either a pair of values that set the normalization range in data units or an object that will map from data units into a [0, 1] interval. Usage implies numeric mapping.
color	NoneType	None	Hue mapping parameters
log_scale	NoneType	None	Set axis scale(s) to log. A single value sets the data axis for any numeric axes in the plot. A pair of values sets each axis independently. Numeric values are interpreted as the desired base (default 10). When `None` or `False`, seaborn defers to the existing Axes scale.
legend	bool	True	If False, suppress the legend for semantic variables.
ax	NoneType	None	Axes information
Returns	:class:`matplotlib.axes.Axes`		The matplotlib axes containing the plot.

# we can use the same df
sorted_df.head()

	kinase	values
68	CK1G2	189898.392
294	VRK2	4191709.640
8	ALPHAK3	4573611.730
249	PRPK	8495330.790
38	CAMLCK	9413689.600

plot_hist(sorted_df,'values')

Scatter plot

source

plot_2d

 plot_2d (X:pandas.core.frame.DataFrame, data=None, x=None, y=None,
          hue=None, size=None, style=None, palette=None, hue_order=None,
          hue_norm=None, sizes=None, size_order=None, size_norm=None,
          markers=True, style_order=None, legend='auto', ax=None)

Make 2D plot from a dataframe that has first column to be x, and second column to be y

	Type	Default	Details
X	DataFrame		a dataframe that has first column to be x, and second column to be y
data	NoneType	None	Input data structure. Either a long-form collection of vectors that can be assigned to named variables or a wide-form dataset that will be internally reshaped.
x	NoneType	None
y	NoneType	None
hue	NoneType	None	Grouping variable that will produce points with different colors. Can be either categorical or numeric, although color mapping will behave differently in latter case.
size	NoneType	None	Grouping variable that will produce points with different sizes. Can be either categorical or numeric, although size mapping will behave differently in latter case.
style	NoneType	None	Grouping variable that will produce points with different markers. Can have a numeric dtype but will always be treated as categorical.
palette	NoneType	None	Method for choosing the colors to use when mapping the `hue` semantic. String values are passed to :func:`color_palette`. List or dict values imply categorical mapping, while a colormap object implies numeric mapping.
hue_order	NoneType	None	Specify the order of processing and plotting for categorical levels of the `hue` semantic.
hue_norm	NoneType	None	Either a pair of values that set the normalization range in data units or an object that will map from data units into a [0, 1] interval. Usage implies numeric mapping.
sizes	NoneType	None	An object that determines how sizes are chosen when `size` is used. List or dict arguments should provide a size for each unique data value, which forces a categorical interpretation. The argument may also be a min, max tuple.
size_order	NoneType	None	Specified order for appearance of the `size` variable levels, otherwise they are determined from the data. Not relevant when the `size` variable is numeric.
size_norm	NoneType	None	Normalization in data units for scaling plot objects when the `size` variable is numeric.
markers	bool	True	Object determining how to draw the markers for different levels of the `style` variable. Setting to `True` will use default markers, or you can pass a list of markers or a dictionary mapping levels of the `style` variable to markers. Setting to `False` will draw marker-less lines. Markers are specified as in matplotlib.
style_order	NoneType	None	Specified order for appearance of the `style` variable levels otherwise they are determined from the data. Not relevant when the `style` variable is numeric.
legend	str	auto	How to draw the legend. If “brief”, numeric `hue` and `size` variables will be represented with a sample of evenly spaced values. If “full”, every group will get an entry in the legend. If “auto”, choose between brief or full representation based on number of levels. If `False`, no legend data is added and no legend is drawn.
ax	NoneType	None	Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca` internally.
Returns	:class:`matplotlib.axes.Axes`		The matplotlib axes containing the plot.

plot_2d(_.iloc[:,:2])

Dimension reduction - 2d plot

source

plot_cluster

 plot_cluster (df:pandas.core.frame.DataFrame, method:str='pca',
               hue:str=None, complexity:int=30, palette:str='tab20',
               legend:bool=False, name_list=None, seed:int=123, s:int=50,
               **kwargs)

Given a dataframe of values, plot it in 2D. The method could be ‘pca’, ‘tsne’, or ‘umap’.

	Type	Default	Details
df	DataFrame		a dataframe of values to be reduced in dimensionality
method	str	pca	dimensionality reduction method, choose from pca, umap, and tsne
hue	str	None	colname of color
complexity	int	30	recommend 30 for tsne, 15 for umap, none for pca
palette	str	tab20	color scheme, could be tab10 if there are fewer categories
legend	bool	False	whether or not to add the legend on the side
name_list	NoneType	None	a list of names to annotate each dot in the plot
seed	int	123	seed for dimensionality reduction
s	int	50	size of the dot
kwargs	VAR_KEYWORD

# load data
aa = Data.get_aa_info()
aa_rdkit = get_rdkit_df(aa, 'SMILES') # get rdkit features from SMILES columns
aa_rdkit = preprocess(aa_rdkit) # remove similar columns
info=Data.get_aa_info()

removing columns: {'fr_C_S', 'fr_lactam', 'SlogP_VSA7', 'fr_Nhpyrrole', 'fr_pyridine', 'fr_benzene', 'SlogP_VSA10', 'fr_ketone_Topliss', 'fr_dihydropyridine', 'fr_nitro', 'NumSaturatedCarbocycles', 'fr_C_O_noCOO', 'fr_oxime', 'fr_thiazole', 'fr_Ar_COO', 'fr_Imine', 'fr_urea', 'PEOE_VSA13', 'fr_furan', 'fr_ketone', 'SlogP_VSA9', 'PMI3', 'fr_aniline', 'fr_isothiocyan', 'fr_guanido', 'fr_piperdine', 'fr_ether', 'fr_Ndealkylation1', 'fr_nitrile', 'HeavyAtomCount', 'fr_phos_ester', 'fr_azo', 'fr_diazo', 'MaxPartialCharge', 'fr_prisulfonamd', 'fr_methoxy', 'fr_nitroso', 'Eccentricity', 'fr_nitro_arom', 'fr_piperzine', 'fr_quatN', 'fr_phenol', 'Chi0', 'SMR_VSA2', 'fr_HOCCN', 'PEOE_VSA5', 'fr_isocyan', 'fr_Ar_NH', 'NumAliphaticCarbocycles', 'fr_amidine', 'fr_ester', 'fr_Al_OH_noTert', 'fr_term_acetylene', 'fr_ArN', 'BCUT2D_MRHI', 'fr_amide', 'NumAliphaticRings', 'fr_barbitur', 'fr_Ar_OH', 'fr_alkyl_halide', 'fr_azide', 'LabuteASA', 'fr_hdrzone', 'fr_nitro_arom_nonortho', 'fr_aldehyde', 'fr_bicyclic', 'fr_phenol_noOrthoHbond', 'fr_thiophene', 'ExactMolWt', 'fr_para_hydroxylation', 'fr_thiocyan', 'HeavyAtomMolWt', 'fr_COO', 'MaxEStateIndex', 'fr_allylic_oxid', 'fr_benzodiazepine', 'fr_lactone', 'NumValenceElectrons', 'NumSpiroAtoms', 'SlogP_VSA12', 'Asphericity', 'NumBridgeheadAtoms', 'fr_alkyl_carbamate', 'NumAmideBonds', 'MolMR', 'SMR_VSA8', 'fr_oxazole', 'SlogP_VSA6', 'NumSaturatedRings', 'fr_phos_acid', 'fr_halogen', 'fr_epoxide', 'fr_imide', 'MinAbsPartialCharge', 'SlogP_VSA11', 'fr_tetrazole', 'Chi1n', 'fr_COO2', 'fr_morpholine', 'fr_N_O', 'fr_sulfonamd', 'fr_sulfone', 'fr_aryl_methyl', 'NumRadicalElectrons', 'fr_hdrzine', 'fr_Ndealkylation2', 'EState_VSA11', 'VSA_EState1', 'NumSaturatedHeterocycles'}
removing columns: set()

plot_cluster(aa_rdkit, name_list = aa.Name.tolist(), hue = 'aa')

Bokeh interactive plot

source

plot_bokeh

 plot_bokeh (X:pandas.core.frame.DataFrame, idx, hue:None, s:int=3,
             **kwargs)

Make an interactive 2D plot with a search box and a tooltip that shows each dot's information on hover

	Type	Default	Details
X	DataFrame		a dataframe of two columns from dimensionality reduction
idx			pd.Series or list of identities used by the search box
hue	None		pd.Series or list that indicates category for each sample
s	int	3	dot size
kwargs	VAR_KEYWORD

# PCA reduce dimension to 2
X = reduce_feature(aa_rdkit)

# get info
info=Data.get_aa_info()

#plot
plot_bokeh(X, 
           idx = info.Name,
           hue = info.Name,
           s=7, 
           smiles = info.SMILES)

Loading BokehJS ...

Bar graph

source

plot_count

 plot_count (cnt, tick_spacing:float=None, palette:str='tab20')

Make bar plot from df[‘x’].value_counts()

	Type	Default	Details
cnt			from df[‘x’].value_counts()
tick_spacing	float	None	tick spacing for x axis
palette	str	tab20

# get count
cnt = aa_rdkit.fr_sulfide.round(3).value_counts()

# make plot
plot_count(cnt)

source

plot_bar

 plot_bar (df, value, group, title=None, figsize=(12, 5), fontsize=14,
           dots=True, rotation=90, ascending=False, data=None, x=None,
           y=None, hue=None, order=None, hue_order=None, estimator='mean',
           errorbar=('ci', 95), n_boot=1000, seed=None, units=None,
           weights=None, orient=None, color=None, palette=None,
           saturation=0.75, fill=True, hue_norm=None, width=0.8,
           dodge='auto', gap=0, log_scale=None, native_scale=False,
           formatter=None, legend='auto', capsize=0, err_kws=None,
           ci=<deprecated>, errcolor=<deprecated>, errwidth=<deprecated>,
           ax=None)

Plot bar graph from unstacked dataframe; need to indicate columns of values and categories

	Type	Default	Details
df
value			colname of value
group			colname of group
title	NoneType	None
figsize	tuple	(12, 5)
fontsize	int	14
dots	bool	True	whether or not to add dots to the graph
rotation	int	90
ascending	bool	False
data	NoneType	None	Dataset for plotting. If `x` and `y` are absent, this is interpreted as wide-form. Otherwise it is expected to be long-form.
x	NoneType	None
y	NoneType	None
hue	NoneType	None
order	NoneType	None
hue_order	NoneType	None
estimator	str	mean	Statistical function to estimate within each categorical bin.
errorbar	tuple	(‘ci’, 95)	Name of errorbar method (either “ci”, “pi”, “se”, or “sd”), or a tuple with a method name and a level parameter, or a function that maps from a vector to a (min, max) interval, or None to hide errorbar. See the :doc:`errorbar tutorial </tutorial/error_bars>` for more information. .. versionadded:: v0.12.0
n_boot	int	1000	Number of bootstrap samples used to compute confidence intervals.
seed	NoneType	None	Seed or random number generator for reproducible bootstrapping.
units	NoneType	None	Identifier of sampling units; used by the errorbar function to perform a multilevel bootstrap and account for repeated measures
weights	NoneType	None	Data values or column used to compute weighted statistics. Note that the use of weights may limit other statistical options. .. versionadded:: v0.13.1
orient	NoneType	None	Orientation of the plot (vertical or horizontal). This is usually inferred based on the type of the input variables, but it can be used to resolve ambiguity when both `x` and `y` are numeric or when plotting wide-form data. .. versionchanged:: v0.13.0 Added ‘x’/‘y’ as options, equivalent to ‘v’/‘h’.
color	NoneType	None	Single color for the elements in the plot.
palette	NoneType	None	Colors to use for the different levels of the `hue` variable. Should be something that can be interpreted by :func:`color_palette`, or a dictionary mapping hue levels to matplotlib colors.
saturation	float	0.75	Proportion of the original saturation to draw fill colors in. Large patches often look better with desaturated colors, but set this to `1` if you want the colors to perfectly match the input values.
fill	bool	True	If True, use a solid patch. Otherwise, draw as line art. .. versionadded:: v0.13.0
hue_norm	NoneType	None	Normalization in data units for colormap applied to the `hue` variable when it is numeric. Not relevant if `hue` is categorical. .. versionadded:: v0.12.0
width	float	0.8	Width allotted to each element on the orient axis. When `native_scale=True`, it is relative to the minimum distance between two values in the native scale.
dodge	str	auto	When hue mapping is used, whether elements should be narrowed and shifted along the orient axis to eliminate overlap. If `"auto"`, set to `True` when the orient variable is crossed with the categorical variable or `False` otherwise. .. versionchanged:: 0.13.0 Added `"auto"` mode as a new default.
gap	int	0	Shrink on the orient axis by this factor to add a gap between dodged elements. .. versionadded:: 0.13.0
log_scale	NoneType	None	Set axis scale(s) to log. A single value sets the data axis for any numeric axes in the plot. A pair of values sets each axis independently. Numeric values are interpreted as the desired base (default 10). When `None` or `False`, seaborn defers to the existing Axes scale. .. versionadded:: v0.13.0
native_scale	bool	False	When True, numeric or datetime values on the categorical axis will maintain their original scaling rather than being converted to fixed indices. .. versionadded:: v0.13.0
formatter	NoneType	None	Function for converting categorical data into strings. Affects both grouping and tick labels. .. versionadded:: v0.13.0
legend	str	auto	How to draw the legend. If “brief”, numeric `hue` and `size` variables will be represented with a sample of evenly spaced values. If “full”, every group will get an entry in the legend. If “auto”, choose between brief or full representation based on number of levels. If `False`, no legend data is added and no legend is drawn. .. versionadded:: v0.13.0
capsize	int	0	Width of the “caps” on error bars, relative to bar spacing.
err_kws	NoneType	None	Parameters of :class:`matplotlib.lines.Line2D`, for the error bar artists. .. versionadded:: v0.13.0
ci	Deprecated		Level of the confidence interval to show, in [0, 100]. .. deprecated:: v0.12.0 Use `errorbar=("ci", ...)`.
errcolor	Deprecated		Color used for the error bar lines. .. deprecated:: 0.13.0 Use `err_kws={'color': ...}`.
errwidth	Deprecated		Thickness of error bar lines (and caps), in points. .. deprecated:: 0.13.0 Use `err_kws={'linewidth': ...}`.
ax	NoneType	None	Axes object to draw the plot onto, otherwise uses the current Axes.
Returns	matplotlib Axes		Returns the Axes object with the plot drawn onto it.

info['cat'] = (info.pKa1>2).astype(int)

# get an unstacked dataframe
plot_bar(info,value='MW',group='cat',palette='tab20')

source

plot_group_bar

 plot_group_bar (df, value_cols, group, figsize=(12, 5), order=None,
                 title=None, fontsize=14, rotation=90, data=None, x=None,
                 y=None, hue=None, hue_order=None, estimator='mean',
                 errorbar=('ci', 95), n_boot=1000, seed=None, units=None,
                 weights=None, orient=None, color=None, palette=None,
                 saturation=0.75, fill=True, hue_norm=None, width=0.8,
                 dodge='auto', gap=0, log_scale=None, native_scale=False,
                 formatter=None, legend='auto', capsize=0, err_kws=None,
                 ci=<deprecated>, errcolor=<deprecated>,
                 errwidth=<deprecated>, ax=None)

Plot grouped bar graph from dataframe.

	Type	Default	Details
df
value_cols			list of column names for values, the order depends on the first item
group			column name of group (e.g., ‘kinase’)
figsize	tuple	(12, 5)
order	NoneType	None
title	NoneType	None
fontsize	int	14
rotation	int	90
data	NoneType	None	Dataset for plotting. If `x` and `y` are absent, this is interpreted as wide-form. Otherwise it is expected to be long-form.
x	NoneType	None
y	NoneType	None
hue	NoneType	None
hue_order	NoneType	None
estimator	str	mean	Statistical function to estimate within each categorical bin.
errorbar	tuple	(‘ci’, 95)	Name of errorbar method (either “ci”, “pi”, “se”, or “sd”), or a tuple with a method name and a level parameter, or a function that maps from a vector to a (min, max) interval, or None to hide errorbar. See the :doc:`errorbar tutorial </tutorial/error_bars>` for more information. .. versionadded:: v0.12.0
n_boot	int	1000	Number of bootstrap samples used to compute confidence intervals.
seed	NoneType	None	Seed or random number generator for reproducible bootstrapping.
units	NoneType	None	Identifier of sampling units; used by the errorbar function to perform a multilevel bootstrap and account for repeated measures
weights	NoneType	None	Data values or column used to compute weighted statistics. Note that the use of weights may limit other statistical options. .. versionadded:: v0.13.1
orient	NoneType	None	Orientation of the plot (vertical or horizontal). This is usually inferred based on the type of the input variables, but it can be used to resolve ambiguity when both `x` and `y` are numeric or when plotting wide-form data. .. versionchanged:: v0.13.0 Added ‘x’/‘y’ as options, equivalent to ‘v’/‘h’.
color	NoneType	None	Single color for the elements in the plot.
palette	NoneType	None	Colors to use for the different levels of the `hue` variable. Should be something that can be interpreted by :func:`color_palette`, or a dictionary mapping hue levels to matplotlib colors.
saturation	float	0.75	Proportion of the original saturation to draw fill colors in. Large patches often look better with desaturated colors, but set this to `1` if you want the colors to perfectly match the input values.
fill	bool	True	If True, use a solid patch. Otherwise, draw as line art. .. versionadded:: v0.13.0
hue_norm	NoneType	None	Normalization in data units for colormap applied to the `hue` variable when it is numeric. Not relevant if `hue` is categorical. .. versionadded:: v0.12.0
width	float	0.8	Width allotted to each element on the orient axis. When `native_scale=True`, it is relative to the minimum distance between two values in the native scale.
dodge	str	auto	When hue mapping is used, whether elements should be narrowed and shifted along the orient axis to eliminate overlap. If `"auto"`, set to `True` when the orient variable is crossed with the categorical variable or `False` otherwise. .. versionchanged:: 0.13.0 Added `"auto"` mode as a new default.
gap	int	0	Shrink on the orient axis by this factor to add a gap between dodged elements. .. versionadded:: 0.13.0
log_scale	NoneType	None	Set axis scale(s) to log. A single value sets the data axis for any numeric axes in the plot. A pair of values sets each axis independently. Numeric values are interpreted as the desired base (default 10). When `None` or `False`, seaborn defers to the existing Axes scale. .. versionadded:: v0.13.0
native_scale	bool	False	When True, numeric or datetime values on the categorical axis will maintain their original scaling rather than being converted to fixed indices. .. versionadded:: v0.13.0
formatter	NoneType	None	Function for converting categorical data into strings. Affects both grouping and tick labels. .. versionadded:: v0.13.0
legend	str	auto	How to draw the legend. If “brief”, numeric `hue` and `size` variables will be represented with a sample of evenly spaced values. If “full”, every group will get an entry in the legend. If “auto”, choose between brief or full representation based on number of levels. If `False`, no legend data is added and no legend is drawn. .. versionadded:: v0.13.0
capsize	int	0	Width of the “caps” on error bars, relative to bar spacing.
err_kws	NoneType	None	Parameters of :class:`matplotlib.lines.Line2D`, for the error bar artists. .. versionadded:: v0.13.0
ci	Deprecated		Level of the confidence interval to show, in [0, 100]. .. deprecated:: v0.12.0 Use `errorbar=("ci", ...)`.
errcolor	Deprecated		Color used for the error bar lines. .. deprecated:: 0.13.0 Use `err_kws={'color': ...}`.
errwidth	Deprecated		Thickness of error bar lines (and caps), in points. .. deprecated:: 0.13.0 Use `err_kws={'linewidth': ...}`.
ax	NoneType	None	Axes object to draw the plot onto, otherwise uses the current Axes.
Returns	matplotlib Axes		Returns the Axes object with the plot drawn onto it.

plot_group_bar(info,['pKa1','pKb2'],'Name')

Box plot

source

plot_box

 plot_box (df, value, group, title=None, figsize=(6, 3), fontsize=14,
           dots=True, rotation=90, data=None, x=None, y=None, hue=None,
           order=None, hue_order=None, orient=None, color=None,
           palette=None, saturation=0.75, fill=True, dodge='auto',
           width=0.8, gap=0, whis=1.5, linecolor='auto', linewidth=None,
           fliersize=None, hue_norm=None, native_scale=False,
           log_scale=None, formatter=None, legend='auto', ax=None)

Plot box plot.

	Type	Default	Details
df
value			colname of value
group			colname of group
title	NoneType	None
figsize	tuple	(6, 3)
fontsize	int	14
dots	bool	True
rotation	int	90
data	NoneType	None	Dataset for plotting. If `x` and `y` are absent, this is interpreted as wide-form. Otherwise it is expected to be long-form.
x	NoneType	None
y	NoneType	None
hue	NoneType	None
order	NoneType	None
hue_order	NoneType	None
orient	NoneType	None	Orientation of the plot (vertical or horizontal). This is usually inferred based on the type of the input variables, but it can be used to resolve ambiguity when both `x` and `y` are numeric or when plotting wide-form data. .. versionchanged:: v0.13.0 Added ‘x’/‘y’ as options, equivalent to ‘v’/‘h’.
color	NoneType	None	Single color for the elements in the plot.
palette	NoneType	None	Colors to use for the different levels of the `hue` variable. Should be something that can be interpreted by :func:`color_palette`, or a dictionary mapping hue levels to matplotlib colors.
saturation	float	0.75	Proportion of the original saturation to draw fill colors in. Large patches often look better with desaturated colors, but set this to `1` if you want the colors to perfectly match the input values.
fill	bool	True	If True, use a solid patch. Otherwise, draw as line art. .. versionadded:: v0.13.0
dodge	str	auto	When hue mapping is used, whether elements should be narrowed and shifted along the orient axis to eliminate overlap. If `"auto"`, set to `True` when the orient variable is crossed with the categorical variable or `False` otherwise. .. versionchanged:: 0.13.0 Added `"auto"` mode as a new default.
width	float	0.8	Width allotted to each element on the orient axis. When `native_scale=True`, it is relative to the minimum distance between two values in the native scale.
gap	int	0	Shrink on the orient axis by this factor to add a gap between dodged elements. .. versionadded:: 0.13.0
whis	float	1.5	Parameter that controls whisker length. If scalar, whiskers are drawn to the farthest datapoint within `whis * IQR` from the nearest hinge. If a tuple, it is interpreted as percentiles that whiskers represent.
linecolor	str	auto	Color to use for line elements, when `fill` is True. .. versionadded:: v0.13.0
linewidth	NoneType	None	Width of the lines that frame the plot elements.
fliersize	NoneType	None	Size of the markers used to indicate outlier observations.
hue_norm	NoneType	None	Normalization in data units for colormap applied to the `hue` variable when it is numeric. Not relevant if `hue` is categorical. .. versionadded:: v0.12.0
native_scale	bool	False	When True, numeric or datetime values on the categorical axis will maintain their original scaling rather than being converted to fixed indices. .. versionadded:: v0.13.0
log_scale	NoneType	None	Set axis scale(s) to log. A single value sets the data axis for any numeric axes in the plot. A pair of values sets each axis independently. Numeric values are interpreted as the desired base (default 10). When `None` or `False`, seaborn defers to the existing Axes scale. .. versionadded:: v0.13.0
formatter	NoneType	None	Function for converting categorical data into strings. Affects both grouping and tick labels. .. versionadded:: v0.13.0
legend	str	auto	How to draw the legend. If “brief”, numeric `hue` and `size` variables will be represented with a sample of evenly spaced values. If “full”, every group will get an entry in the legend. If “auto”, choose between brief or full representation based on number of levels. If `False`, no legend data is added and no legend is drawn. .. versionadded:: v0.13.0
ax	NoneType	None	Axes object to draw the plot onto, otherwise uses the current Axes.
Returns	matplotlib Axes		Returns the Axes object with the plot drawn onto it.

plot_box(info,value='MW',group='cat',palette='tab20')

Pearson correlation

source

plot_corr

 plot_corr (x, y, xlabel=None, ylabel=None, data=None, text_location=[0.8,
            0.1], x_estimator=None, x_bins=None, x_ci='ci', scatter=True,
            fit_reg=True, ci=95, n_boot=1000, units=None, seed=None,
            order=1, logistic=False, lowess=False, robust=False,
            logx=False, x_partial=None, y_partial=None, truncate=True,
            dropna=True, x_jitter=None, y_jitter=None, label=None,
            color=None, marker='o', scatter_kws=None, line_kws=None,
            ax=None)

Given a dataframe and the names of two columns, plot the two columns’ correlation

	Type	Default	Details
x			x axis values, or colname of x axis
y			y axis values, or colname of y axis
xlabel	NoneType	None	x axis label
ylabel	NoneType	None	y axis label
data	NoneType	None	dataframe that contains data
text_location	list	[0.8, 0.1]
x_estimator	NoneType	None	Apply this function to each unique value of `x` and plot the resulting estimate. This is useful when `x` is a discrete variable. If `x_ci` is given, this estimate will be bootstrapped and a confidence interval will be drawn.
x_bins	NoneType	None	Bin the `x` variable into discrete bins and then estimate the central tendency and a confidence interval. This binning only influences how the scatterplot is drawn; the regression is still fit to the original data. This parameter is interpreted either as the number of evenly-sized (not necessarily spaced) bins or the positions of the bin centers. When this parameter is used, it implies that the default of `x_estimator` is `numpy.mean`.
x_ci	str	ci	Size of the confidence interval used when plotting a central tendency for discrete values of `x`. If `"ci"`, defer to the value of the `ci` parameter. If `"sd"`, skip bootstrapping and show the standard deviation of the observations in each bin.
scatter	bool	True	If `True`, draw a scatterplot with the underlying observations (or the `x_estimator` values).
fit_reg	bool	True	If `True`, estimate and plot a regression model relating the `x` and `y` variables.
ci	int	95	Size of the confidence interval for the regression estimate. This will be drawn using translucent bands around the regression line. The confidence interval is estimated using a bootstrap; for large datasets, it may be advisable to avoid that computation by setting this parameter to None.
n_boot	int	1000	Number of bootstrap resamples used to estimate the `ci`. The default value attempts to balance time and stability; you may want to increase this value for “final” versions of plots.
units	NoneType	None	If the `x` and `y` observations are nested within sampling units, those can be specified here. This will be taken into account when computing the confidence intervals by performing a multilevel bootstrap that resamples both units and observations (within unit). This does not otherwise influence how the regression is estimated or drawn.
seed	NoneType	None	Seed or random number generator for reproducible bootstrapping.
order	int	1	If `order` is greater than 1, use `numpy.polyfit` to estimate a polynomial regression.
logistic	bool	False	If `True`, assume that `y` is a binary variable and use `statsmodels` to estimate a logistic regression model. Note that this is substantially more computationally intensive than linear regression, so you may wish to decrease the number of bootstrap resamples (`n_boot`) or set `ci` to None.
lowess	bool	False	If `True`, use `statsmodels` to estimate a nonparametric lowess model (locally weighted linear regression). Note that confidence intervals cannot currently be drawn for this kind of model.
robust	bool	False	If `True`, use `statsmodels` to estimate a robust regression. This will de-weight outliers. Note that this is substantially more computationally intensive than standard linear regression, so you may wish to decrease the number of bootstrap resamples (`n_boot`) or set `ci` to None.
logx	bool	False	If `True`, estimate a linear regression of the form y ~ log(x), but plot the scatterplot and regression model in the input space. Note that `x` must be positive for this to work.
x_partial	NoneType	None
y_partial	NoneType	None
truncate	bool	True	If `True`, the regression line is bounded by the data limits. If `False`, it extends to the `x` axis limits.
dropna	bool	True
x_jitter	NoneType	None
y_jitter	NoneType	None
label	NoneType	None	Label to apply to either the scatterplot or regression line (if `scatter` is `False`) for use in a legend.
color	NoneType	None	Color to apply to all plot elements; will be superseded by colors passed in `scatter_kws` or `line_kws`.
marker	str	o	Marker to use for the scatterplot glyphs.
scatter_kws	NoneType	None
line_kws	NoneType	None
ax	NoneType	None	Axes object to draw the plot onto, otherwise uses the current Axes.
Returns	matplotlib Axes		The Axes object containing the plot.

norm = Data.get_pspa_st_norm().iloc[:,:-6].T

norm.head()

kinase	AAK1	ACVR2A	ACVR2B	AKT1	AKT2	AKT3	ALK2	ALK4	ALPHAK3	AMPKA1	...	VRK1	VRK2	WNK1	WNK3	WNK4	YANK2	YANK3	YSK1	YSK4	ZAK
-5P	0.0720	0.0415	0.0533	0.0603	0.0602	0.0705	0.0536	0.0552	0.0571	0.0555	...	0.0710	0.0684	0.0482	0.0413	0.0369	0.0580	0.0625	0.0590	0.0593	0.0604
-5G	0.0245	0.0481	0.0517	0.0594	0.0617	0.0624	0.0659	0.0574	0.0478	0.0504	...	0.0786	0.0676	0.0510	0.0572	0.0523	0.0699	0.0776	0.0713	0.0728	0.0641
-5A	0.0284	0.0584	0.0566	0.0552	0.0643	0.0745	0.0662	0.0605	0.0253	0.0534	...	0.0633	0.0636	0.0555	0.0503	0.0539	0.0637	0.0647	0.0731	0.0744	0.0659
-5C	0.0456	0.0489	0.0772	0.0605	0.0582	0.0628	0.0762	0.0483	0.0384	0.0588	...	0.0641	0.0644	0.0576	0.0732	0.0544	0.0602	0.0598	0.0606	0.0734	0.0631
-5S	0.0425	0.0578	0.0533	0.0516	0.0534	0.0442	0.0567	0.0574	0.0571	0.0504	...	0.0595	0.0573	0.0561	0.0569	0.0580	0.0580	0.0545	0.0542	0.0597	0.0597

5 rows × 303 columns

plot_corr(data=norm, x='AAK1', y='BIKE')

Matrix heatmap

source

get_similarity

 get_similarity (df, metric='euclidean')

Calculate distance matrix of a df; also return inverse df (similarity df)

source

plot_matrix

 plot_matrix (dist_matrix, inverse_color=False)

Plot distance/similarity matrix

rdkit_corr = aa_rdkit.T.corr()
rdkit_corr.head()

aa	A	C	D	E	F	G	H	I	K	L	...	S	T	V	W	Y	s	t	y	Kac	Kme3
aa
A	1.000000	0.362285	0.090512	-0.112693	-0.189588	0.725605	0.000479	0.189212	-0.099613	0.270430	...	0.560214	0.377420	0.294869	-0.345882	-0.292322	-0.257989	-0.294085	-0.540545	-0.446649	-0.353592
C	0.362285	1.000000	-0.064188	-0.173111	-0.249098	0.409469	0.009036	0.008175	-0.245936	-0.027502	...	0.298754	0.143817	0.101138	-0.221078	-0.274137	-0.039459	-0.154141	-0.339660	-0.297758	-0.210207
D	0.090512	-0.064188	1.000000	0.803788	-0.305487	0.065810	-0.211706	-0.272792	-0.229956	-0.263288	...	0.302876	0.228545	-0.181244	-0.340342	-0.191931	0.386431	0.295387	-0.044046	-0.131867	-0.231809
E	-0.112693	-0.173111	0.803788	1.000000	-0.242507	-0.068794	-0.239756	-0.210293	-0.106431	-0.265852	...	0.042331	-0.043214	-0.171171	-0.251833	-0.049223	0.241647	0.166818	0.097376	0.125016	-0.177356
F	-0.189588	-0.249098	-0.305487	-0.242507	1.000000	-0.155484	0.011415	0.093393	0.019161	0.109600	...	-0.352333	-0.328142	-0.001738	0.660432	0.453157	-0.359352	-0.300223	0.252583	-0.089912	0.023859

5 rows × 25 columns

plot_matrix(rdkit_corr)

AUCDF

source

get_AUCDF

 get_AUCDF (df, col, reverse=False, plot=True, xlabel='Rank of reported
            kinase')

Plot CDF curve and get relative area under the curve

get_AUCDF(sorted_df,'values')

0.8754977337946649

Confusion matrix

source

plot_confusion_matrix

 plot_confusion_matrix (target, pred, class_names:list=['0', '1'],
                        normalize=False, title='Confusion matrix',
                        cmap=<matplotlib.colors.LinearSegmentedColormap
                        object at 0x7fbf483ae6e0>)

Plot the confusion matrix.

	Type	Default	Details
target			pd.Series
pred			pd.Series
class_names	list	['0', '1']
normalize	bool	False
title	str	Confusion matrix
cmap	LinearSegmentedColormap	<matplotlib.colors.LinearSegmentedColormap object at 0x7fbf483ae6e0>

target = info.MW<160
pred = info.pKa1>2.1

plot_confusion_matrix(target,pred,normalize=True)

Normalized confusion matrix

End

# #| export
# def get_logo2(full: pd.DataFrame, # a dataframe that contains the full matrix of a kinase, with index as amino acid, and columns as positions
#               title: str = 'logo', # title of the graph
#               ):
    
#     "Plot logo from a full frequency matrix of a kinase"
    
#     # get S,T,Y ratio
#     S_ratio,T_ratio,Y_ratio = full[0][['s','t','y']]/full[0][['s','t','y']].sum()
    
#     # drop position 0 
#     full = full.drop(columns=[0])
    
#     # identify the minimum value other than 0
#     min_val = full[full > 0].min().min()
    
#     # replace 0s with the identified minimum value
#     full = full.replace(0, min_val)
    
#     norm_p = full.T

#     # calculate ratio, use subtraction
#     ratio =norm_p.apply(lambda r: r-r.median(),axis=1)
    
#     # calculate ratio based on previous method, divide values by median, followed by log2
#     # ratio =norm_p.apply(lambda r: r/r.median(),axis=1)
#     # ratio = np.log2(ratio)
    
#     # get the max value for a position
#     m = ratio.apply(lambda row: row[row > 0].sum(), axis=1).max()
    
#     # get the relative height of S,T,Y relative to the max value
#     new_row = pd.DataFrame({'S': S_ratio*m, 'T':T_ratio*m,'Y':Y_ratio*m}, index=[0]) 
    
#     # prepare the matrix for logomaker
#     ratio2 = pd.concat([ratio, new_row], ignore_index=False).fillna(0)

#     # logo_func(ratio2,title)
#     plot_logo_raw(ratio2.T,title=title)

# # get kinase-substrate dataset
# df = Data.get_ks_dataset()

# # get data for a specific kinase
# df_k = df.query('kinase == "DYRK2"')

# # get the full freq matrix
# _,full = get_freq(ks_k)

# # plot logo
# get_logo2(full,'DYRK2')