from katlas.data import *
# Mutual information
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from itertools import combinations
# Load the kinase-substrate dataset and keep only the rows for the kinase
# with UniProt accession P06493.
df = Data.get_ks_dataset()
df = df[df.kinase_uniprot == 'P06493']
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.feature_selection import mutual_info_classif
import seaborn as sns
import matplotlib.pyplot as plt
# Example peptide sequence input
sequences = df['site_seq'].tolist()  # List of aligned peptide sequences
if not sequences:
    raise ValueError("No site sequences available to compute mutual information.")
L = len(sequences[0])  # Peptide length (assumed all same length)
if any(len(seq) != L for seq in sequences):
    # A ragged input cannot form the (N_peptides, L) character matrix below.
    raise ValueError("All site sequences must have the same length.")

# Convert sequences to character matrix (N_peptides, L)
peptide_position_matrix = np.array([list(seq) for seq in sequences])

# One column per alignment position; column labels are the integers 0..L-1.
peptide_df = pd.DataFrame(peptide_position_matrix)

# Encode as category for MI (sklearn requires numeric, so we convert
# categories to integer codes). Codes are assigned per column, which is fine
# because MI only depends on which rows share a symbol, not on the code value.
for col in peptide_df.columns:
    peptide_df[col] = pd.Categorical(peptide_df[col]).codes

# Compute mutual information matrix. The diagonal is never written and
# therefore stays at 0.
mi_matrix = np.zeros((L, L))

for i, j in combinations(range(L), 2):
    xi = peptide_df[i].values.reshape(-1, 1)  # single feature column, shape (N, 1)
    xj = peptide_df[j].values                 # treated as the class labels
    mi = mutual_info_classif(xi, xj, discrete_features=True)
    mi_matrix[i, j] = mi[0]
    mi_matrix[j, i] = mi[0]  # Make symmetric
# Step 1: Re-index positions so the central residue is position 0
# (-20 to +20 for a 41-residue window).
L = peptide_df.shape[1]
center_idx = L // 2  # position 0 is at index 20 when L == 41; assumes a symmetric window
pos_labels = [idx - center_idx for idx in range(L)]

# Step 3: Mask the upper triangle (diagonal included) so each position pair
# is drawn only once.
mask = np.triu(np.ones_like(mi_matrix, dtype=bool))

# Step 4: Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    mi_matrix, mask=mask, cmap='viridis',
    xticklabels=pos_labels, yticklabels=pos_labels,
    square=True, cbar_kws={'label': 'Mutual Information'},
)

# This figure keeps position 0, so the title does not say "excluding Y".
plt.title("Mutual Information Between Peptide Positions")
plt.xlabel("Position (relative to central Y)")
plt.ylabel("Position (relative to central Y)")
plt.tight_layout()
plt.show()
# Step 1: Re-index positions so the central residue is position 0
# (-20 to +20 for a 41-residue window).
L = peptide_df.shape[1]
center_idx = L // 2  # position 0 is at index 20 when L == 41; assumes a symmetric window
pos_labels = [idx - center_idx for idx in range(L)]

# Step 2: Remove position 0 (row and column from MI matrix and label)
mi_matrix_noY = np.delete(mi_matrix, center_idx, axis=0)
mi_matrix_noY = np.delete(mi_matrix_noY, center_idx, axis=1)
pos_labels_noY = [p for p in pos_labels if p != 0]

# Step 3: Mask the upper triangle (diagonal included) so each position pair
# is drawn only once.
mask = np.triu(np.ones_like(mi_matrix_noY, dtype=bool))

# Step 4: Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    mi_matrix_noY, mask=mask, cmap='viridis',
    xticklabels=pos_labels_noY, yticklabels=pos_labels_noY,
    square=True, cbar_kws={'label': 'Mutual Information'},
)

plt.title("Mutual Information Between Peptide Positions (excluding Y)")
plt.xlabel("Position (relative to central Y)")
plt.ylabel("Position (relative to central Y)")
plt.tight_layout()
plt.show()
i  # notebook cell echo of the current loop index; the recorded output was:
# 39
# Average MI of each position with every other position, taken from the
# centre-removed matrix.
L = mi_matrix_noY.shape[0]
mi_scores = []

for i in range(L):
    # Both slices read the lower triangle: the part of row i left of the
    # diagonal, and the part of column i below it. Together they cover each
    # partner of position i exactly once and skip the diagonal entry.
    row = mi_matrix_noY[i, :i]
    col = mi_matrix_noY[i + 1:, i]
    mi_all = np.concatenate([row, col])
    mi_scores.append(np.mean(mi_all))  # or sum(), max(), etc.
# Plot position-wise MI
# Labels run -half..+half with 0 skipped, which is -20..20 for 40 scores.
half = len(mi_scores) // 2
pos_labels_noY = [p for p in range(-half, half + 1) if p != 0]

plt.figure(figsize=(10, 4))
plt.plot(pos_labels_noY, mi_scores, marker='o')
plt.title("Average Mutual Information per Position (excluding central Y)")
plt.xlabel("Position (relative to central Y)")
plt.ylabel("Avg MI with other positions")
plt.grid(True)
plt.tight_layout()
plt.show()