import pandas as pd
Process AlphaMissense dataset
Uncomment the line below to download the AlphaMissense data:
# !wget https://zenodo.org/records/8208688/files/AlphaMissense_aa_substitutions.tsv.gz
# Load the gzip-compressed, tab-separated AlphaMissense substitution table.
# header=3 takes the column names from the fourth line of the file and
# discards the three lines before it (presumably a comment/licence preamble
# -- confirm against the downloaded file).
# NOTE: this is a slow load; the recorded run took about three minutes.
df = pd.read_csv('AlphaMissense_aa_substitutions.tsv.gz', compression='gzip', header=3, sep='\t', quotechar='"')
CPU times: user 1min 2s, sys: 37.8 s, total: 1min 39s
Wall time: 3min 6s
# Numeric residue position: strip the first and last characters of
# protein_variant and parse what remains as an integer.
# (Assumes variants look like "M1A" -- reference residue, position,
# substituted residue; TODO confirm against the dataset description.)
df['position'] = df.protein_variant.str[1:-1].astype(int)

# Residue key without the substituted amino acid: protein_variant minus its
# last character (e.g. "M1").
df['aa_position'] = df.protein_variant.str[:-1]

# Average the pathogenicity score over every row that shares the same
# protein and residue key, giving one row per (uniprot_id, aa_position).
df_mean = df.groupby(['uniprot_id','aa_position'])['am_pathogenicity'].mean().reset_index()

# Sanity check shown as the cell output: input rows per aggregated row
# (the recorded run printed 19).
df.shape[0]//df_mean.shape[0]
19
# Numeric residue position for the aggregated table: drop the leading
# character of aa_position and parse the remainder as an integer.
df_mean['position'] = df_mean.aa_position.str[1:].astype(int)

# Order by protein and then by numeric position (sorting on the integer
# column rather than the aa_position string), and renumber the index from 0.
df_mean = df_mean.sort_values(by=['uniprot_id','position']).reset_index(drop=True)

# Persist the per-residue means.
# NOTE(review): the 'raw/' directory presumably has to exist beforehand --
# confirm before running on a fresh checkout.
df_mean.to_parquet('raw/AM_mean.parquet')

# Bare expression so the notebook renders the resulting frame.
df_mean
| | uniprot_id | aa_position | am_pathogenicity | position |
---|---|---|---|---|
0 | A0A024R1R8 | M1 | 0.414942 | 1 |
1 | A0A024R1R8 | S2 | 0.231174 | 2 |
2 | A0A024R1R8 | S3 | 0.188332 | 3 |
3 | A0A024R1R8 | H4 | 0.141832 | 4 |
4 | A0A024R1R8 | E5 | 0.245732 | 5 |
... | ... | ... | ... | ... |
11377646 | X6R8D5 | V123 | 0.296089 | 123 |
11377647 | X6R8D5 | L124 | 0.260532 | 124 |
11377648 | X6R8D5 | W125 | 0.901605 | 125 |
11377649 | X6R8D5 | R126 | 0.326921 | 126 |
11377650 | X6R8D5 | S127 | 0.407611 | 127 |
11377651 rows × 4 columns