Example Datasets

[1]:

import os
import warnings

# Pin the thread-count environment variables read by the numerical backends
# (OpenMP, OpenBLAS, MKL, vecLib, numexpr) to "4". This runs before the
# scientific imports further down in this cell.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        "4",
    )
)

from grnet.toydata import load_dataset, load_metadata
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import TSNE

# Suppress RuntimeWarning messages whose originating module matches
# "threadpoolctl" so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=RuntimeWarning, module="threadpoolctl")

Each example dataset consists of the following:

- Count data (\(\log_2(\mathrm{RPM}+1)\))
- Metadata

Prototype1

[2]:

# Load the "prototype1" example dataset and its accompanying metadata table.
data1 = load_dataset("prototype1")
meta1 = load_metadata("prototype1")

The dataset is returned as an \(N \times D\) matrix (\(N\): number of samples, \(D\): number of genes).

[3]:

# Display the dataset as the cell's rich output (one row per sample, one column per gene).
data1

[3]:

	gene_1	gene_2	gene_3	gene_4	gene_5	gene_6	gene_7	gene_8	gene_9	gene_10	...	gene_9991	gene_9992	gene_9993	gene_9994	gene_9995	gene_9996	gene_9997	gene_9998	gene_9999	gene_10000
sample_1	0.0	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	...	2.753703	0.000000	0.000000	0.000000	0.000000	10.273207	0.0	0.0	0.0	0.000000
sample_2	0.0	0.0	2.380056	0.0	0.000000	1.633563	9.828775	0.000000	0.000000	0.0	...	0.000000	2.869546	0.000000	4.155610	0.000000	9.673981	0.0	0.0	0.0	0.000000
sample_3	0.0	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	...	0.000000	0.000000	0.000000	2.083379	0.000000	9.719177	0.0	0.0	0.0	0.000000
sample_4	0.0	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	3.832107	0.0	...	8.609340	6.330085	0.000000	0.000000	3.939771	10.232104	0.0	0.0	0.0	0.000000
sample_5	0.0	0.0	0.000000	0.0	0.000000	0.000000	0.000000	3.034451	0.000000	0.0	...	0.000000	0.000000	0.000000	0.000000	0.000000	10.091159	0.0	0.0	0.0	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
sample_996	0.0	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	3.380126	0.0	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.0	0.0	2.048660
sample_997	0.0	0.0	0.000000	0.0	0.000000	0.000000	0.000000	2.194422	0.000000	0.0	...	0.000000	0.000000	4.489439	0.000000	0.000000	0.000000	0.0	0.0	0.0	3.552277
sample_998	0.0	0.0	0.000000	0.0	8.068697	0.000000	0.000000	0.000000	0.000000	0.0	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.0	0.0	4.741316
sample_999	0.0	0.0	0.000000	0.0	1.796236	0.000000	2.571982	0.000000	0.000000	0.0	...	0.000000	0.000000	0.000000	0.000000	3.740458	0.000000	0.0	0.0	0.0	3.985389
sample_1000	0.0	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	...	0.000000	7.084269	0.000000	0.000000	0.000000	0.000000	0.0	0.0	0.0	9.423962

1000 rows × 10000 columns

The metadata provides the cluster assignment of each sample.

[4]:

# Display the metadata table (a single "cluster" column indexed by sample).
meta1

[4]:

	cluster
sample_1	1
sample_2	1
sample_3	1
sample_4	1
sample_5	1
...	...
sample_996	5
sample_997	5
sample_998	5
sample_999	5
sample_1000	5

1000 rows × 1 columns

Visualization

[5]:

# Project the samples onto two t-SNE axes; random_state=0 is passed so the
# estimator is seeded explicitly. The result keeps the sample index of data1.
tsne1 = pd.DataFrame(
    data=TSNE(random_state=0, n_components=2).fit_transform(data1),
    columns=[f"TSNE{i + 1}" for i in range(2)],
    index=data1.index,
)

[6]:

# Scatter plot of the t-SNE embedding, one colour per cluster label.
fig, ax = plt.subplots(figsize=(4, 4))

# Compute the distinct cluster labels once rather than on every iteration.
_clusters = meta1.cluster.unique()

for i, v in enumerate(_clusters):
    # The boolean mask relies on tsne1 and meta1 sharing the same sample index.
    _dat = tsne1[meta1.cluster == v]

    # Draw on the explicit Axes instead of pyplot's implicit current axes,
    # consistent with the ax.legend / ax.axis / ax.set calls below.
    ax.scatter(
        _dat.iloc[:, 0], _dat.iloc[:, 1],
        # Spread the clusters evenly over the rainbow colormap.
        color=plt.cm.rainbow(i / len(_clusters)),
        label=f"cluster_{v}",
    )

# Place the legend outside the plotting area, to the right of the axes.
ax.legend(loc="center left", bbox_to_anchor=(1, .5), frameon=False)
ax.axis("off")
# The trailing semicolon suppresses the repr of the returned value.
ax.set(title="prototype1");

../_images/notebooks_example_data_11_0.png

Prototype2

[7]:

# Load the "prototype2" example dataset and its accompanying metadata table.
data2 = load_dataset("prototype2")
meta2 = load_metadata("prototype2")

[8]:

# Display the dataset as the cell's rich output (one row per sample, one column per gene).
data2

[8]:

	gene_1	gene_2	gene_3	gene_4	gene_5	gene_6	gene_7	gene_8	gene_9	gene_10	...	gene_9991	gene_9992	gene_9993	gene_9994	gene_9995	gene_9996	gene_9997	gene_9998	gene_9999	gene_10000
sample_1	0.000000	3.507129	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	3.507129	0.000000	0.00000	0.0	8.004430	0.000000	0.000000	0.000000
sample_2	0.000000	0.000000	0.000000	0.000000	0.0	1.715359	0.0	0.0	0.0	0.000000	...	0.000000	2.477047	4.707172	0.000000	2.97294	0.0	8.314904	0.000000	0.000000	0.000000
sample_3	4.567996	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	4.567996	1.941078	0.00000	0.0	7.904205	0.000000	0.000000	0.000000
sample_4	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	4.320645	0.000000	0.00000	0.0	8.602809	0.000000	0.000000	0.000000
sample_5	0.000000	8.222258	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	4.526527	0.000000	0.00000	0.0	7.378120	0.000000	0.000000	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
sample_1496	3.786429	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.000000	3.169765	0.000000	0.000000	0.00000	0.0	0.000000	0.000000	8.647279	1.378401
sample_1497	6.193268	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.0	3.366368	0.000000	10.656825	0.000000
sample_1498	6.028006	3.371017	0.000000	3.001727	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.0	3.371017	0.000000	10.925548	0.000000
sample_1499	5.144795	0.000000	1.595626	2.821051	0.0	0.000000	0.0	0.0	0.0	8.670563	...	2.334715	0.000000	0.000000	0.000000	0.00000	0.0	3.473954	3.715184	9.994661	0.000000
sample_1500	4.536216	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.0	4.205152	0.000000	9.952163	0.000000

1500 rows × 10000 columns

[9]:

# Display the metadata table (a single "cluster" column indexed by sample).
meta2

[9]:

	cluster
sample_1	1
sample_2	1
sample_3	1
sample_4	1
sample_5	1
...	...
sample_1496	9
sample_1497	9
sample_1498	9
sample_1499	9
sample_1500	9

1500 rows × 1 columns

Visualization

[10]:

# Project the samples onto two t-SNE axes; random_state=0 is passed so the
# estimator is seeded explicitly. The result keeps the sample index of data2.
tsne2 = pd.DataFrame(
    data=TSNE(random_state=0, n_components=2).fit_transform(data2),
    columns=[f"TSNE{i + 1}" for i in range(2)],
    index=data2.index,
)

[11]:

# Scatter plot of the t-SNE embedding, one colour per cluster label.
fig, ax = plt.subplots(figsize=(4, 4))

# Compute the distinct cluster labels once rather than on every iteration.
_clusters = meta2.cluster.unique()

for i, v in enumerate(_clusters):
    # The boolean mask relies on tsne2 and meta2 sharing the same sample index.
    _dat = tsne2[meta2.cluster == v]

    # Draw on the explicit Axes instead of pyplot's implicit current axes,
    # consistent with the ax.legend / ax.axis / ax.set calls below.
    ax.scatter(
        _dat.iloc[:, 0], _dat.iloc[:, 1],
        # Spread the clusters evenly over the hsv colormap; i / n stays below 1.
        color=plt.cm.hsv(i / len(_clusters)),
        label=f"cluster_{v}",
    )

# Place the legend outside the plotting area, to the right of the axes.
ax.legend(loc="center left", bbox_to_anchor=(1, .5), frameon=False)
ax.axis("off")
# The trailing semicolon suppresses the repr of the returned value.
ax.set(title="prototype2");

../_images/notebooks_example_data_18_0.png