scatter

Dimensionality reduction, scatter plots, and correlation plots.

df = sns.load_dataset('penguins').dropna().reset_index(drop=True)
df2 = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
print(df.shape)
print(df2.shape)

(333, 7)
(333, 4)

df.head()

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
3	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female
4	Adelie	Torgersen	39.3	20.6	190.0	3650.0	Male

Dimensionality Reduction

reduce_feature


def reduce_feature(
    df:DataFrame, # numeric feature matrix
    method:str='pca', # one of pca, tsne, umap
    complexity:int=20, # perplexity for tsne or neighbors for umap
    n:int=2, # number of output dimensions
    load:str | pathlib.Path | None=None, # path to a previously fitted reducer
    save:str | pathlib.Path | None=None, # optional path for persisting the reducer
    seed:int=123, # random_state used by reducers that support it
    kwargs:VAR_KEYWORD # forwarded reducer kwargs
)->DataFrame:

Reduce a feature matrix to a lower-dimensional embedding dataframe.

reduce_feature(df2, method='pca', n=2)

	PCA1	PCA2
0	-457.325073	-13.351587
1	-407.252205	-9.179113
2	-957.044676	8.160444
3	-757.115802	1.867653
4	-557.177302	-3.389158
...	...	...
328	718.068699	2.338199
329	643.090909	4.280699
330	1543.098355	-2.232010
331	992.994900	-4.605154
332	1193.002584	-5.417312

333 rows × 2 columns

Scatter Plots

plot_2d


def plot_2d(
    embedding_df:DataFrame, # dataframe with at least two numeric columns
    hue:str | None=None, # column name used for color when present in embedding_df
    palette:str='tab20', # seaborn palette name
    legend:bool=False, # whether to draw a legend
    name_list:list[str] | None=None, # labels used to annotate points
    s:int=20, # marker size
    legend_title:str | None=None, # optional legend title override
    kwargs:VAR_KEYWORD
):

Plot the first two columns of an embedding dataframe.

df2 = reduce_feature(df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']], method='pca', n=2)
df2['species'] = df['species'].values
plot_2d(df2, hue='species', legend=True)

plot_cluster


def plot_cluster(
    df:DataFrame, # numeric feature matrix, optionally including a hue column
    method:str='pca', # one of pca, tsne, umap
    hue:str | pandas.Series | list | None=None, # hue column name or per-row hue values
    complexity:int=30, # perplexity for tsne or neighbors for umap
    palette:str='tab20', # seaborn palette name
    legend:bool=False, # whether to draw a legend
    name_list:list[str] | None=None, # point annotations
    seed:int=123, # random seed passed to the reducer
    s:int=50, # marker size
    legend_title:str | None=None, # optional legend title override
    kwargs:VAR_KEYWORD
):

Reduce features and immediately plot the first two embedding dimensions.

plot_cluster(df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'species']], method='pca', hue='species', legend=True)

Correlation Plot

plot_rel


def plot_rel(
    df:DataFrame, # dataframe that contains the x and y columns
    x:str, # x-axis column name
    y:str, # y-axis column name
    text_location:tuple=(0.8, 0.1), # annotation location in axes coordinates
    method:str | None='spearman', # one of spearman, pearson, or None
    index_list:list[str] | None=None, # row labels to annotate
    hue:str | None=None, # optional categorical hue column
    reg_line:bool=True, # whether to draw a regression line when hue is used
    data:NoneType=None, x_estimator:NoneType=None, x_bins:NoneType=None, x_ci:str='ci', scatter:bool=True,
    fit_reg:bool=True, ci:int=95, n_boot:int=1000, units:NoneType=None, seed:NoneType=None, order:int=1,
    logistic:bool=False, lowess:bool=False, robust:bool=False, logx:bool=False, x_partial:NoneType=None,
    y_partial:NoneType=None, truncate:bool=True, dropna:bool=True, x_jitter:NoneType=None, y_jitter:NoneType=None,
    label:NoneType=None, color:NoneType=None, marker:str='o', scatter_kws:NoneType=None, line_kws:NoneType=None,
    ax:NoneType=None
):

Plot a pairwise relationship with an optional correlation annotation.

df2 = df[['bill_length_mm', 'flipper_length_mm', 'species']].head(12).copy()
df2.index = [f'pt{i}' for i in range(len(df2))]
plot_rel(df2, x='bill_length_mm', y='flipper_length_mm', hue='species', index_list=['pt0', 'pt11'])