#default_exp datasets
Datasets
Includes the functions: `construct_diamond`, `make_diamonds`, `make_swiss_roll`, `make_tree`, `make_worm_data`, `make_eb_data`, `make_dyngen_data`, `relabel_data`, `rings`, `make_rings`, and `make_jacks`.
#hide
from nbdev.showdoc import *
#export
import os
import pandas as pd, numpy as np
import phate
from sklearn import datasets
import seaborn as sns
"bright")
sns.color_palette(import matplotlib as mpl
#export
def construct_diamond(
    points_per_petal:int=200,
    petal_width:float=0.25,
    direction:str='y'
):
    '''
    Arguments:
    ----------
        points_per_petal (int): Defaults to `200`. Number of points per petal.
        petal_width (float): Defaults to `0.25`. How narrow the diamonds are.
        direction (str): Defaults to `'y'`. Options `'y'` or `'x'`. Whether to make vertical
            or horizontal diamonds.
    Returns:
    ---------
        points (numpy.ndarray): the 2d array of points.
    '''
    n_side = int(points_per_petal/2)
    axis_1 = np.concatenate((
        np.linspace(0, petal_width, int(n_side/2)),
        np.linspace(petal_width, 0, int(n_side/2))
    ))
    axis_2 = np.linspace(0, 1, n_side)
    axes = (axis_1, axis_2) if direction == 'y' else (axis_2, axis_1)
    points = np.vstack(axes).T
    points = np.vstack((points, -1*points))
    points = np.vstack((points, np.vstack((points[:, 0], -1*points[:, 1])).T))
    return points
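A minimal usage sketch of `construct_diamond` on its own; the scatterplot call is illustrative, mirroring the demo further below:
pts = construct_diamond(points_per_petal=200, petal_width=0.25, direction='y')
sns.scatterplot(x=pts[:, 0], y=pts[:, 1])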
def make_diamonds(
    points_per_petal:int=200,
    petal_width:float=0.25,
    colors:int=5,
    scale_factor:float=30,
    use_gaussian:bool=True
):
    '''
    Arguments:
    ----------
        points_per_petal (int): Defaults to `200`. Number of points per petal.
        petal_width (float): Defaults to `0.25`. How narrow the diamonds are.
        colors (int): Defaults to `5`. The number of timesteps (colors) to produce.
        scale_factor (float): Defaults to `30`. How much to scale the noise by
            (larger values make smaller noise).
        use_gaussian (bool): Defaults to `True`. Whether to use gaussian or uniform noise.
    Returns:
    ---------
        df (pandas.DataFrame): DataFrame with columns `samples`, `d1`, `d2`, where `samples`
            is the time index (corresponds to colors).
    '''
    upper = construct_diamond(points_per_petal, petal_width, 'y')
    lower = construct_diamond(points_per_petal, petal_width, 'x')
    data = np.vstack((upper, lower))

    noise_fn = np.random.randn if use_gaussian else np.random.rand
    noise = noise_fn(*data.shape) / scale_factor
    data = data + noise
    df = pd.DataFrame(data, columns=['d1', 'd2'])

    c_values = np.linspace(colors, 1, colors)
    c_thresholds = np.linspace(1, 0+1/(colors+1), colors)
    df.insert(0, 'samples', colors)
    df['samples'] = colors
    for value, threshold in zip(c_values, c_thresholds):
        index = ((np.abs(df.d1) <= threshold) & (np.abs(df.d2) <= threshold))
        df.loc[index, 'samples'] = value
    df.set_index('samples')
    return df
df = make_diamonds(200, 0.25, 5)
sns.scatterplot(data=df, x='d1', y='d2', hue='samples', palette='viridis')
<AxesSubplot:xlabel='d1', ylabel='d2'>
#export
def make_swiss_roll(n_points=1500):
    '''
    Arguments:
    ----------
        n_points (int): Defaults to `1500`. Number of points to sample.
    Returns:
    ---------
        df (pandas.DataFrame): DataFrame with columns `samples`, `d1`, `d2`, `d3`,
            where `samples` is the time index (corresponds to colors).
    '''
    X, color = datasets.make_swiss_roll(n_samples=n_points)
    df = pd.DataFrame(np.hstack((np.round(color).reshape(-1, 1), X)), columns='samples d1 d2 d3'.split())
    df.samples -= np.min(df.samples)
    return df
#export
def make_tree():
    '''
    Arguments:
    ----------
        None.
    Returns:
    ---------
        df (pandas.DataFrame): DataFrame with columns `samples`, `d1`, `d2`, `d3`,
            `d4`, `d5` where `samples` are the branch labels (correspond to colors).
    '''
    tree, branches = phate.tree.gen_dla(
        n_dim=200, n_branch=10, branch_length=300,
        rand_multiplier=2, seed=37, sigma=5
    )
    phate_operator = phate.PHATE(n_components=5, n_jobs=-1)
    tree_phate = phate_operator.fit_transform(tree)
    df = pd.DataFrame(np.hstack((branches.reshape(-1, 1), tree_phate)), columns='samples d1 d2 d3 d4 d5'.split())
    return df
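`make_tree` takes no arguments; a usage sketch (note the PHATE embedding step can take a minute or two):
df = make_tree()
sns.scatterplot(data=df, x='d1', y='d2', hue='samples', palette='viridis')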
#export
from MIOFlow.constants import WORM_FILE
def make_worm_data():
    data = np.load(WORM_FILE)
    sample_labels = data['sample_labels']
    embedding = data['embedding']
    df = pd.concat([
        pd.DataFrame(sample_labels, columns=['samples']),
        pd.DataFrame(embedding, columns=list(map(lambda e: f'd{e}', '12345')))
    ], axis=1)
    df.set_index('samples')
    return df
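A usage sketch, assuming the archive referenced by `WORM_FILE` is available locally:
df = make_worm_data()
df.head()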
#export
from MIOFlow.constants import EB_BODIES_FILE, EB_BODIES_PSEUDO_4, EB_BODIES_PSEUDO_6, EB_BODIES_PSEUDO_25, EB_BODIES_PSEUDO_82
def make_eb_data(phate=False, phate_dims=5, n_sample='all', random_state=1):
    data = np.load(EB_BODIES_FILE)
    sample_labels = data['sample_labels']
    embedding = data['pca']
    df = pd.DataFrame(embedding, columns=[f'd{i}' for i in range(1, 101)])
    df['samples'] = sample_labels
    df.set_index('samples')
    df['pt4'] = np.load(EB_BODIES_PSEUDO_4)
    df['pt6'] = np.load(EB_BODIES_PSEUDO_6)
    df['pt25'] = np.load(EB_BODIES_PSEUDO_25)
    df['pt82'] = np.load(EB_BODIES_PSEUDO_82)
    if n_sample != 'all' and not phate:
        df = df.sample(n=n_sample, random_state=random_state)

    if phate:
        from phate import PHATE
        phate_operator = PHATE(phate_dims, n_jobs=-1)
        sub_sample = df.sample(n=n_sample, random_state=random_state)
        Y_phate = phate_operator.fit_transform(sub_sample[[f'd{i}' for i in range(1, 11)]])
        df = pd.concat([
            pd.DataFrame(df.samples.values, columns=['samples']),
            pd.DataFrame(Y_phate, columns=list(map(lambda e: f'd{e}', range(1, phate_dims+1)))),
            df['pt4'], df['pt6'], df['pt25']
        ], axis=1)
    return df
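A usage sketch, assuming the EB bodies files from `MIOFlow.constants` are available locally; the subsample size here is an arbitrary illustrative choice:
df = make_eb_data(phate=False, n_sample=5000)
df.head()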
#export
from MIOFlow.constants import (DYNGEN_INFO_FILE, DYNGEN_EXPR_FILE)
from phate import PHATE
import warnings
def make_dyngen_data(
    time_col='sim_time', phate_dims=10, round_labels=True,
    use_gaussian:bool=False, add_noise=False, add_noise_after_phate=False,
    scale_factor:float=1, scale_phate=100, n_bins=5, column='d1'
):
    _valid = 'simulation_i step_ix sim_time'.split()
    if time_col not in _valid:
        time_col = _valid[0]

    noise_fn = np.random.randn if use_gaussian else np.random.rand

    exp = pd.read_csv(DYNGEN_EXPR_FILE)

    if add_noise and not add_noise_after_phate:
        noise = noise_fn(*exp.shape) / scale_factor
        exp += noise

    ids = pd.read_csv(DYNGEN_INFO_FILE, skipfooter=1, engine='python').dropna(axis=1)
    df = pd.concat([ids, exp], axis=1).set_index('cell_id')
    df['samples'] = df[time_col]
    df = df.drop(columns=_valid)

    phate_operator = PHATE(phate_dims, n_jobs=-1)
    Y_phate = phate_operator.fit_transform(df.drop(columns=['samples']))

    Y_phate *= scale_phate

    if add_noise and add_noise_after_phate:
        noise = noise_fn(*Y_phate.shape) / scale_factor
        Y_phate += noise

    df = pd.concat([
        pd.DataFrame(df.samples.values, columns=['samples']),
        pd.DataFrame(Y_phate, columns=list(map(lambda e: f'd{e}', range(1, phate_dims+1))))
    ], axis=1)
    if round_labels:
        # rescale time labels from 0 - 1000 to 0 - 10
        df.samples = np.round(df.samples, -2) / 100

    df = relabel_data(df, min_bin=0, n_bins=n_bins, column=column)

    if phate_dims in [2, 5]:
        locs = (df['d1'] <= -2.0)
        df.loc[locs, 'samples'] = -1
        df.drop(df[df['samples'] == -1].index, inplace=True)
    elif phate_dims in [10, 15, 30, 40, 60]:
        locs = (df['d1'] <= -1.9)
        df.loc[locs, 'samples'] = -1
        df.drop(df[df['samples'] == -1].index, inplace=True)
    else:
        warnings.warn("Not tested for this 'phate_dims', using the same threshold as the one for '[10, 15, 30, 40, 60]' dims.")
        locs = (df['d1'] <= -1.9)
        df.loc[locs, 'samples'] = -1
        df.drop(df[df['samples'] == -1].index, inplace=True)

    return df
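A usage sketch, assuming the DynGen CSV files from `MIOFlow.constants` are available locally (the PHATE fit dominates the runtime):
df = make_dyngen_data(phate_dims=10, n_bins=5)
sns.scatterplot(data=df, x='d1', y='d2', hue='samples', palette='viridis')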
def relabel_data(df, min_bin=0, n_bins=10, column='d1', samples_key='samples'):
    dff = df.copy()

    x_min = np.min(dff[column])
    x_max = np.max(dff[column])

    parts = np.linspace(x_min, x_max, n_bins+1)
    value = list(range(min_bin, n_bins+1, 1))
    for i, x in list(zip(value, parts))[::-1]:
        if i == 0:
            continue
        locs = (dff[column] <= x)
        dff.loc[locs, samples_key] = i
    return dff
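A self-contained worked example of the binning behaviour; the toy frame is hypothetical, for illustration only. Points are relabeled `1` through `n_bins` according to which slice of the `d1` range they fall into:
toy = pd.DataFrame({'d1': np.linspace(0, 1, 8), 'samples': 0})
relabel_data(toy, min_bin=0, n_bins=4, column='d1')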
#export
import numpy as np, seaborn as sns, pandas as pd, matplotlib.pyplot as plt
def rings(
    N:int, M:int = None,
    data_scale:float = 1,
    add_noise:bool = True,
    noise_scale_theta:float = 0.7,
    noise_scale_radius:float = 0.03,
    buffer:float = 0.8,
    **kwargs
) -> (np.ndarray, np.ndarray):
    '''
    Generate the petal ("rings") data set.

    Arguments:
        N (int): Number of points to make.
        M (int): Defaults to `None`. If `None`, will automatically determine how many circles to make.
        data_scale (float): Defaults to `1`. Multiplier to rescale the data.
        add_noise (bool): Defaults to `True`. Whether or not to add noise to the data.
        noise_scale_theta (float): Defaults to `0.7`. How much to scale the noise added to `theta`.
        noise_scale_radius (float): Defaults to `0.03`. How much to scale the noise added to `radius`.
        buffer (float): Defaults to `0.8`. How much to scale the `radius` to add some padding between circles.
        **kwargs
    Returns:
        X (np.ndarray): The x, y coordinates for the points.
        C (np.ndarray): The cluster number of each point.
    '''
    X = []  # points in respective petals
    Y = []  # auxiliary array (points on outer circle)
    C = []

    assert N > 4, "Require more than four data points"

    # Number of 'petals' to put into the data set. This is required to
    # ensure that the full space is used.
    if M is None:
        M = int(np.floor(np.sqrt(N)))
    thetas = np.linspace(0, 2 * np.pi, M, endpoint=False)

    for theta in thetas:
        Y.append(np.asarray([np.cos(theta), np.sin(theta)]))

    # Radius of the smaller circles is half of the chord distance between
    # two 'consecutive' points on the circle.
    radius = 0.5 * np.linalg.norm(Y[0] - Y[1])

    for i, x in enumerate(Y):
        for theta in thetas:
            for j in range(N // M // len(thetas)):
                r = radius if not add_noise else radius + np.random.randn() * noise_scale_radius
                t = theta if not add_noise else theta + np.random.randn() * noise_scale_theta
                r *= buffer
                X.append(np.asarray([r * np.cos(t) - x[0], r * np.sin(t) - x[1]]))
                # Indicates that this point belongs to the $i$th circle.
                C.append(i)

    X = np.asarray(X)
    C = np.asarray(C)
    X *= data_scale
    return X, C
def make_rings(
    N:int, M:int = None,
    data_scale:float = 1,
    add_noise:bool = True,
    noise_scale_theta:float = 0.7,
    noise_scale_radius:float = 0.03,
    buffer:float = 0.8,
    **kwargs
) -> pd.DataFrame:
    '''
    Arguments:
        N (int): Number of points to make.
        M (int): Defaults to `None`. If `None`, will automatically determine how many circles to make.
        data_scale (float): Defaults to `1`. Multiplier to rescale the data.
        add_noise (bool): Defaults to `True`. Whether or not to add noise to the data.
        noise_scale_theta (float): Defaults to `0.7`. How much to scale the noise added to `theta`.
        noise_scale_radius (float): Defaults to `0.03`. How much to scale the noise added to `radius`.
        buffer (float): Defaults to `0.8`. How much to scale the `radius` to add some padding between circles.
        **kwargs
    Returns:
        df (pd.DataFrame): DataFrame with columns `d1`, `d2`, and `samples`, where `samples`
            is the cluster number of each point.
    '''
    x, c = rings(N, M, data_scale, add_noise, noise_scale_theta, noise_scale_radius, buffer)
    df = pd.DataFrame(x, columns=[f'd{i+1}' for i in range(x.shape[1])])
    df['samples'] = c
    df.set_index('samples')
    return df
df = make_rings(
    2000, 6,
    data_scale = 5,
    add_noise = True,
    noise_scale_theta = 0.7,
    noise_scale_radius = 0.03,
    buffer = 0.8,
)
df.head()
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df, x='d1', y='d2',
    hue='samples', palette='viridis',
    size='samples', sizes=(100, 100),
)
<AxesSubplot:xlabel='d1', ylabel='d2'>
#export
def make_jacks(
    n_axes = 3,
    points = 1000,
    label_by = 'axis',
    n_classes = 3,
    use_neg = True,
    data_scale = 3,
    add_noise = True,
    noise_scale = 0.03,
):
    '''
    Arguments:
    ----------
        n_axes (int): Defaults to `3`. Number of axes (dimensions) of the jack.
        points (int): Defaults to `1000`. Total number of points, split evenly across axes.
        label_by (str): Defaults to `'axis'`. Options `'axis'` or `'coord'`. Whether to label
            points by the axis they lie on or by binned position along the axis.
        n_classes (int): Defaults to `3`. Number of classes when `label_by='coord'`.
        use_neg (bool): Defaults to `True`. Whether to also generate the negative half of each axis.
        data_scale (float): Defaults to `3`. Multiplier to rescale the data.
        add_noise (bool): Defaults to `True`. Whether or not to add noise to the data.
        noise_scale (float): Defaults to `0.03`. How much to scale the noise.
    Returns:
    ---------
        df (pandas.DataFrame): DataFrame with columns `d1` through `d{n_axes}` and `samples`,
            where `samples` holds the class labels.
    '''
    _valid_label_bys = 'axis coord'.split()
    if label_by not in _valid_label_bys:
        label_by = _valid_label_bys[0]

    results = []
    classes = []

    axes = np.eye(n_axes)

    for i, axis in enumerate(axes):
        segment = np.linspace(0, 1, points // n_axes).reshape(-1, 1)
        if add_noise:
            coordinates = axis * (segment + np.random.randn(segment.size, 1) * noise_scale)
        else:
            coordinates = axis * segment
        results.extend(coordinates.tolist())
        if label_by == 'axis':
            labels = [i for j in range(len(segment))]
            classes.extend(labels)
        elif label_by == 'coord':
            labels = [
                k for k in range(n_classes)
                for j in range(points // n_axes // n_classes)
            ]
            for j in range(len(segment) - len(labels)):
                labels.append(n_classes - 1)
            classes.extend(labels)

    if use_neg:
        for i, axis in enumerate(axes):
            segment = np.linspace(0, 1, points // n_axes).reshape(-1, 1) * -1
            if add_noise:
                coordinates = axis * (segment + np.random.randn(segment.size, 1) * noise_scale)
            else:
                coordinates = axis * segment
            results.extend(coordinates.tolist())
            if label_by == 'axis':
                labels = [n_axes + i for j in range(len(segment))]
                classes.extend(labels)
            elif label_by == 'coord':
                labels = [
                    k for k in range(n_classes)
                    for j in range(points // n_axes // n_classes)
                ]
                for j in range(len(segment) - len(labels)):
                    labels.append(n_classes - 1)
                classes.extend(labels)

    results = np.array(results) + np.random.randn(len(results), n_axes) * noise_scale
    results *= data_scale
    df = pd.DataFrame(results, columns=[f'd{i+1}' for i in range(n_axes)])
    df['samples'] = classes
    df.set_index('samples')
    return df
df = make_jacks(
    n_axes = 3,
    points = 1000,
    label_by = 'axis',
    n_classes = 3,
    use_neg = True,
    data_scale = 3,
    add_noise = True,
    noise_scale = 0.03,
)
fig = plt.figure(figsize=(12, 8))
ax = plt.axes(projection='3d')
ax.scatter(df.d1, df.d2, df.d3, c=df.samples)
<mpl_toolkits.mplot3d.art3d.Path3DCollection>