Process AlphaMissense dataset

from pathlib import Path

import pandas as pd

Uncomment the line below to download the AlphaMissense data:

# !wget https://zenodo.org/records/8208688/files/AlphaMissense_aa_substitutions.tsv.gz
# Load the gzip-compressed, tab-separated substitution table. header=3 tells
# pandas to take the column names from row index 3 (zero-based) and to discard
# the rows before it.
df = pd.read_csv(
    'AlphaMissense_aa_substitutions.tsv.gz',
    sep='\t',
    header=3,
    quotechar='"',
    compression='gzip',
)
CPU times: user 1min 2s, sys: 37.8 s, total: 1min 39s
Wall time: 3min 6s
# Split each protein_variant string into its parts: dropping the first and last
# characters leaves the integer residue position, and dropping only the last
# character leaves the residue-plus-position key (values such as 'M1').
df['position'] = df['protein_variant'].str.slice(1, -1).astype(int)
df['aa_position'] = df['protein_variant'].str.slice(stop=-1)

# Average am_pathogenicity over all variants that share the same protein and
# residue position, giving one row per (uniprot_id, aa_position).
df_mean = (
    df.groupby(['uniprot_id', 'aa_position'])['am_pathogenicity']
    .mean()
    .reset_index()
)

# Sanity check: number of variant rows per aggregated position (integer division).
len(df) // len(df_mean)
19
# Recover the integer residue position from aa_position (everything after the
# first character) so rows can be ordered numerically rather than as strings.
df_mean['position'] = df_mean['aa_position'].str[1:].astype(int)
df_mean = df_mean.sort_values(by=['uniprot_id', 'position']).reset_index(drop=True)

# to_parquet does not create missing parent directories; make sure 'raw/'
# exists so the long load/aggregation above is not lost to a failed write.
Path('raw').mkdir(parents=True, exist_ok=True)
df_mean.to_parquet('raw/AM_mean.parquet')
df_mean
uniprot_id aa_position am_pathogenicity position
0 A0A024R1R8 M1 0.414942 1
1 A0A024R1R8 S2 0.231174 2
2 A0A024R1R8 S3 0.188332 3
3 A0A024R1R8 H4 0.141832 4
4 A0A024R1R8 E5 0.245732 5
... ... ... ... ...
11377646 X6R8D5 V123 0.296089 123
11377647 X6R8D5 L124 0.260532 124
11377648 X6R8D5 W125 0.901605 125
11377649 X6R8D5 R126 0.326921 126
11377650 X6R8D5 S127 0.407611 127

11377651 rows × 4 columns