import numpy as np,pandas as pd
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import scipy.stats as stats
Sankey diagram
Data is a csv/excel file, with H1-H3 columns as control and H4-H6 as experimental file, the value represent RNA level (not know if it is log2 transformed already, but use Avg2/Avg1 for FC)
Calculate p-value
= pd.read_csv('hong_data.csv',na_values=['#N/A'])
df 'Gene'] = df['human_gene']
df[
# Remove zero as it affect p value
'H1', 'H2', 'H3', 'H4', 'H5', 'H6']] = df[['H1', 'H2', 'H3', 'H4', 'H5', 'H6']].replace(0, np.nan)
df[[
'Control'] = df[['H1','H2','H3']].mean(1)
df['Experiment'] = df[['H4','H5','H6']].mean(1)
df[
# Calculate fold change and log fold change
'Fold Change'] = df['Experiment'] / df['Control']
df['Log2 Fold Change'] = np.log2(df['Fold Change'])
df[
# Perform t-test between Control and Experiment groups
= ['H1', 'H2', 'H3']
control_cols = ['H4', 'H5', 'H6'] experiment_cols
def ttest_row(row):
# Extract values from each group and drop NaN values
= row[['H1', 'H2', 'H3']].dropna().astype(float) # Group 1: Drop NaN and convert to float
group1 = row[['H4', 'H5', 'H6']].dropna().astype(float) # Group 2: Drop NaN and convert to float
group2
# Perform t-test only if both groups have at least 2 values
if len(group1) > 1 and len(group2) > 1:
= stats.ttest_ind(group1, group2, equal_var=False) # Welch's t-test (unequal variance),which is more robust when the two groups may not have the same variance.
t_stat, p_value return pd.Series({'T-statistic': t_stat, 'P-value': p_value})
else:
return pd.Series({'T-statistic': np.nan, 'P-value': np.nan})
# Apply the function row-wise
= df.apply(ttest_row, axis=1) result
= pd.concat([df,result],axis=1) df
# Compute the -log10 of the 'P-value'
'log10_P'] = -np.log10(df['P-value'])
df[
# Assign the sign of 'Log2 Fold Change' to the 'log10_P' values
'signed_log10_P'] = np.sign(df['Log2 Fold Change']) * df['log10_P'] df[
'ttest_processed_zero_exclude.csv',index=False) df.to_csv(
Pathway analysis
Get up-regulated genes and down-regulated genes with p-value<0.05; do analysis separately.
Go to DAVID bioinformatics website, go to Upload –> Step 1: paste gene symbols –> Step2: Official Gene Symbol, select species, if mouse: mus, if human: homo, –> Step3: select Gene List –> Step4: submit.
Click start analysis
, go to GO-BP, click chart, download gene list.
(Optional) Plot gene expression
= df.dropna(subset=['Fold Change']) # Drop NaN values
df_cleaned = df_cleaned[df_cleaned['Fold Change'] != 0] # Drop 0 values
df_cleaned = df_cleaned[np.isfinite(df_cleaned['Fold Change'])] # Drop inf and -inf values df_cleaned
'zero'] = 0 df_cleaned[
= df_cleaned.sort_values('signed_log10_P',ascending=False).reset_index(drop=True) df_cleaned
= df_cleaned.query('`P-value` <=0.05').copy() df_sig
100),df_sig.tail(100)]),'human_gene','zero','signed_log10_P') plot_sankey(pd.concat([df_sig.head(
Sankey for pathway analysis
def plot_sankey(df, gene_col, col1, col2,figsize=(1000,600),write=False,link_colors=None,link_values=None):
= df.copy()
df
# Combine control and experiment values into a single array for joint normalization
= pd.concat([df[col1], df[col2]])
all_values
# Normalize all values together
= (all_values - all_values.min()) / (all_values.max() - all_values.min())
normalized_values
= (1-normalized_values)+1e-13
normalized_values
# Split back into control and experiment
f'{col1}_norm'] = normalized_values[:len(df)]
df[f'{col2}_norm'] = normalized_values[len(df):]
df[
# Define the source (control) and target (experiment) nodes
= list(range(len(df))) # Control nodes (one per gene)
sources = list(range(len(df), 2 * len(df))) # Experiment nodes (one per gene)
targets
= [[0, orig_y, inv_y] for orig_y, inv_y in zip(df[col1], df[f'{col1}_norm'])] + \
customdata 0.5, orig_y, inv_y] for orig_y, inv_y in zip(df[col2], df[f'{col2}_norm'])]
[[
# Create the Sankey diagram with y-axis positions reflecting joint-normalized values and uniform height
= go.Figure(go.Sankey(
fig = "perpendicular",
arrangement =dict(
node=15,
pad=10, # Uniform thickness for all nodes
thickness=dict(color="black", width=0.5),
line=['']*len(df) + df[gene_col].tolist(), # Labels for control and experiment nodes
label=["blue"] * len(df)*2 if not link_colors else link_colors*2, # Blue for control, green for experiment
color=[0.01] * len(df) + [0.99] * len(df), # Left (x=0) for control, right (x=1) for experiment
x=df[f'{col1}_norm'].tolist() + df[f'{col2}_norm'].tolist(), # Y positions based on joint-normalized values
y=customdata,
customdata='Gene: %{label}<br>Value: %{customdata[1]:.2f}'
hovertemplate
),=dict(
link=sources, # Link control nodes to experiment nodes
source=targets,
target=[0.01]*(len(df)) if not link_values else link_values, # the expression is relative, changing 1 to 0.1 to all would not change as no variation
value="rgba(0, 150, 255, 0.5)" if not link_colors else link_colors # Color of the links
color
)
))
# Update layout with titles
fig.update_layout(="Gene Expression Changes",
title_text=10,
font_size=False,
autosize=figsize[0],
width=figsize[1],
height=dict(r=300)
margin
)
# Save the figure as an HTML file
if write:
file="sankey_diagram.html", auto_open=False)
pio.write_html(fig, # Show the figure
fig.show()
From the DAVID output csv, pick out important pathways.
= pd.read_csv('hong_highlight.csv')
df 'term'] = df['Term'].str.split('~').str[-1]
df['term'] = df['term'].apply(lambda x: x[0].upper() + x[1:] if len(x) > 0 else x)
df[
'y_left'] = df.direction.map({'up':0.01,'down':0.99})
df['y_right'] = df.direction.map({'up':0.99,'down':0.01})
df[
# fourth value is transparency
'rgba_colors'] = df.direction.map({'up':"rgba(255, 99, 71, 0.5)",'down':"rgba(38, 101, 255, 0.5)"})
df[
'term',col1='y_left',col2='y_right',figsize=(1000,500),
plot_sankey(df,=df.rgba_colors.tolist(),
link_colors=df.signed_log10P.abs().tolist(),
link_values=True
write )
Bar plot
import seaborn as sns
import matplotlib.pyplot as plt
# Set the output format to SVG
= pd.read_csv('hong_highlight.csv')
df 'term'] = df['Term'].str.split('~').str[-1]
df['term'] = df['term'].apply(lambda x: x[0].upper() + x[1:] if len(x) > 0 else x)
df[
= df.set_index('term')['signed_log10P'].sort_values()
data = data.apply(lambda x: 'darkred' if x > 0 else 'mediumblue')
colors
=colors)
data.plot.barh(color'')
plt.ylabel('Signed log10(p-value)')
plt.xlabel('TAC vs. Vehicle \n GO Biological Process Pathway Analysis')
plt.title('barplot.svg',bbox_inches='tight') plt.savefig(
Others
def plot_sankey_multicol(df, columns):
= df.copy()
df = len(columns)
num_col = pd.concat([df[col] for col in columns])
all_values
# Normalize all values together
= (all_values - all_values.min()) / (all_values.max() - all_values.min())
normalized_values
# invert the value as html use 0 as highest value
= 1-normalized_values
normalized_values
# plotly can't deal with 0 value, so add a small number to 0
= [i+1e-13 if i ==0 else i for i in normalized_values]
normalized_values
# Assign normalized values back to DataFrame in new columns
for i, col in enumerate(columns):
+ '_norm'] = normalized_values[i*len(df):(i+1)*len(df)]
df[col
# Prepare data for Sankey diagram
= []
nodes = []
node_colors = []
node_x = []
node_y
# Define colors for different conditions
= ['blue', 'green', 'red', 'purple', 'orange', 'yellow'] # Extend colors as needed
colors = [i/(num_col-1) for i in range(num_col)] # Positions along x-axis
position_x # as it can't deal with 0 effectively, add a small number
=[i+1e-13 if i ==0 else i for i in position_x]
position_x
# Create nodes for each condition
for i, col in enumerate(columns):
'Gene'].tolist())
nodes.extend(df[% len(colors)]] * len(df))
node_colors.extend([colors[i
* len(df))
node_x.extend([position_x[i]] + '_norm'].tolist())
node_y.extend(df[col
# Define source and target indices for links
= []
source = []
target for i in range(num_col - 1):
range(i*len(df), (i+1)*len(df)))
source.extend(range((i+1)*len(df), (i+2)*len(df)))
target.extend(
# Define link properties
= [0.1] * (len(source)-1)+[0.1+0.000001] # All links have a uniform flow size (can be adjusted)
link_value = 'rgba(0, 150, 255, 0.5)' # Link color
link_color
# Create the Sankey diagram
= go.Figure(go.Sankey(
fig ="fixed",
arrangement=dict(
node=15,
pad=20,
thickness=dict(color="black", width=0.5),
line=nodes,
label=node_colors,
color=node_x,
x=node_y
y
),=dict(
link=source,
source=target,
target=link_value,
value=link_color
color
)
))
# Update layout with titles
fig.update_layout(="Gene Expression Changes: Control vs Experiment",
title_text=10,
font_size=False,
autosize=1000,
width=600
height
)
# Show the figure
fig.show()
# Example usage
= pd.DataFrame({
df 'Gene': ['Gene1', 'Gene2', 'Gene3', 'Gene4', 'Gene5', 'Gene6', 'Gene7', 'Gene8', 'Gene9', 'Gene10'],
'Control': [1.0]*10,
'Experiment1': [2.0]*10,
'Experiment2': [3]*9+[1]*1
})'Control', 'Experiment1', 'Experiment2']) plot_sankey_multicol(df, [