import pandas as pd

# Process AlphaMissense dataset
# Uncomment the line below to download the AlphaMissense data:
# !wget https://zenodo.org/records/8208688/files/AlphaMissense_aa_substitutions.tsv.gz

# Load the gzip-compressed, tab-separated substitutions table into memory.
# header=3 takes the column names from 0-based row 3 of the file; the rows
# above it are not parsed as data.
df = pd.read_csv(
    'AlphaMissense_aa_substitutions.tsv.gz',
    compression='gzip',
    header=3,
    sep='\t',
    quotechar='"',
)
# Recorded cell output:
# CPU times: user 1min 2s, sys: 37.8 s, total: 1min 39s
# Wall time: 3min 6s
# Integer position: protein_variant with its first and last characters removed.
# (Presumably the format is <ref aa><position><alt aa>, e.g. "M1A" -- confirm.)
df['position'] = df.protein_variant.str[1:-1].astype(int)

# Per-residue key: protein_variant without its last character (e.g. "M1").
df['aa_position'] = df.protein_variant.str[:-1]

# Average am_pathogenicity over all rows sharing the same protein and residue.
df_mean = (
    df.groupby(['uniprot_id', 'aa_position'])['am_pathogenicity']
    .mean()
    .reset_index()
)

# Whole-number ratio of variant rows to distinct (protein, residue) groups.
df.shape[0] // df_mean.shape[0]
# Recorded cell output:
# 19
# Recover the integer position from the residue key (drop its first character).
df_mean['position'] = df_mean.aa_position.str[1:].astype(int)

# aa_position is a string key, so the grouped rows are not in numeric sequence
# order; sort by protein and numeric position, then renumber the index.
df_mean = df_mean.sort_values(by=['uniprot_id', 'position']).reset_index(drop=True)

# NOTE(review): the 'raw/' directory is assumed to exist already; nothing here
# creates it.
df_mean.to_parquet('raw/AM_mean.parquet')

df_mean
# Recorded cell output:
# | | uniprot_id | aa_position | am_pathogenicity | position |
# |---|---|---|---|---|
# | 0 | A0A024R1R8 | M1 | 0.414942 | 1 |
# | 1 | A0A024R1R8 | S2 | 0.231174 | 2 |
# | 2 | A0A024R1R8 | S3 | 0.188332 | 3 |
# | 3 | A0A024R1R8 | H4 | 0.141832 | 4 |
# | 4 | A0A024R1R8 | E5 | 0.245732 | 5 |
# | ... | ... | ... | ... | ... |
# | 11377646 | X6R8D5 | V123 | 0.296089 | 123 |
# | 11377647 | X6R8D5 | L124 | 0.260532 | 124 |
# | 11377648 | X6R8D5 | W125 | 0.901605 | 125 |
# | 11377649 | X6R8D5 | R126 | 0.326921 | 126 |
# | 11377650 | X6R8D5 | S127 | 0.407611 | 127 |
# 11377651 rows × 4 columns