# Pipeline clustering yml
# Download this file
# Pipeline pipeline_clustering_scanpy.py configuration file
# ==============================================
# compute resource options
# ------------------------
resources:
  # Number of threads used for parallel jobs, in three tiers.
  # high: this must come with enough memory to load your mudata and do
  # computationally intensive tasks
  threads_high: 2
  # medium: this must come with enough memory to load your mudata and do
  # computationally light tasks
  threads_medium: 2
  # low: this must come with enough memory to load text files and do plotting;
  # requires much less memory than the other two
  threads_low: 2
  # NOTE(review): fewer_jobs is nested under resources based on key order in
  # the flattened original - confirm against the pipeline's expected schema.
  fewer_jobs: True
# Path to the conda env to use. Leave blank if running native or if your
# cluster automatically inherits the login node environment.
condaenv:

# Start
# --------------------------
sample_prefix: 'teaseq'
scaled_obj: '../teaseq_temp.h5mu'
# full_obj is only applicable if you have filtered your scaled object by hvgs.
# In that case panpipes uses the full obj to calculate rank_gene_groups and to
# plot those genes, so it should contain all the genes you want to include in
# rank_gene_groups, plus logged_counts as a layer.
# If your scaled_obj already contains all the genes, leave full_obj blank.
full_obj:
# Run clustering on each individual modality: one True/False flag per modality.
modalities:
  rna: True
  prot: True
  atac: True
# Multimodal clustering settings.
multimodal:
  # if True, will look for WNN, or totalVI output
  run_clustering: True
  # WNN, mofa, totalVI: this tells the pipeline where to look for the
  # multimodal representation
  integration_method: wnn
# batch_correction: harmony # None, harmony, scanorama, bbknn or combat
# ---------------------------------------
# parameters for find neighbours
# ---------------------------------------
# find neighbour parameters
#-----------------------------
# number of neighbors to use when calculating the graph for clustering and umap.
# Neighbour-graph settings, one sub-mapping per modality (rna, prot, atac).
# Every modality takes the same six keys.
neighbors:
  rna:
    # NOTE(review): presumably reuses a neighbour graph already stored in the
    # object instead of recomputing it - confirm against the pipeline.
    use_existing: True
    # Reduced-dimension representation used for neighbours and umap.
    # The original comment said "number of Principal Components", but the
    # value is a name (X_pca), not a count.
    dim_red: X_pca
    # how many components to use for clustering
    n_dim_red: 30
    # number of neighbours
    k: 30
    # metric: euclidean | cosine
    metric: euclidean
    # scanpy | hnsw (from scvelo)
    method: scanpy
  prot:
    use_existing: True
    # Reduced-dimension representation used for neighbours and umap
    dim_red: X_pca
    # how many components to use for clustering
    n_dim_red: 30
    # number of neighbours
    k: 30
    # metric: euclidean | cosine
    metric: euclidean
    # scanpy | hnsw (from scvelo)
    method: scanpy
  atac:
    use_existing: True
    # Reduced-dimension representation used for neighbours and umap
    dim_red: X_pca
    # how many components to use for clustering
    n_dim_red: 30
    # number of neighbours
    k: 30
    # metric: euclidean | cosine
    metric: euclidean
    # scanpy | hnsw (from scvelo)
    method: scanpy
# ---------------------------------------
# parameters for umap calculation
# ---------------------------------------
# UMAP settings: a global run switch plus a list of mindist values for each
# modality and for the multimodal representation.
umap:
  # set run to False if you are happy with the existing umap from integration
  run: True
  # NOTE(review): presumably one UMAP is computed per listed mindist value -
  # confirm against the pipeline.
  rna:
    mindist:
      - 0.25
      - 0.5
  prot:
    mindist:
      - 0.1
  atac:
    mindist:
      - 0.5
  multimodal:
    mindist:
      - 0.5
# UMAP reduced dimensions will be stored using the format
# ---------------------------------------
# parameters for clustering
# ---------------------------------------
# Clustering settings: for each modality (and multimodal), a list of
# resolutions and the clustering algorithm to use.
clusterspecs:
  rna:
    resolutions:
      - 0.2
      - 0.6
      - 1
    algorithm: leiden  # (louvain or leiden)
  prot:
    resolutions:
      - 0.2
      - 0.6
      - 1
    algorithm: leiden  # (louvain or leiden)
  atac:
    resolutions:
      - 0.2
      - 0.3
    algorithm: leiden  # (louvain or leiden)
  multimodal:
    resolutions:
      - 0.5
      - 0.7
    algorithm: leiden  # (louvain or leiden)
# ---------------------------------------
# parameters for finding marker genes
# ---------------------------------------
# where pseudo_seurat is set to False
# we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
# where pseudo_seurat is set to True, a python implementation of Seurat::FindMarkers (written by CRG) is used
# Marker-gene settings, one sub-mapping per modality plus multimodal.
markerspecs:
  rna:
    run: True
    layer: logged_counts
    # method options: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
    method: t-test_overestim_var
    # if a cluster contains less than n cells then do not bother doing marker
    # analysis
    mincells: 10
    # where pseudo_seurat is set to False we run
    # https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
    # where pseudo_seurat is set to True, a python implementation of
    # Seurat::FindMarkers (written by CRG) is used
    pseudo_seurat: False
    # these next two settings do not matter unless pseudo_seurat is set to
    # True. If applicable, look at the Seurat documentation for FindMarkers
    # for details.
    minpct: 0.1
    threshuse: 0.25
  prot:
    # NOTE(review): run is left blank here, which YAML parses as null, while
    # rna and atac set it to True - set it explicitly if prot markers are
    # wanted.
    run:
    layer: clr
    # if a cluster contains less than n cells then do not bother doing marker
    # analysis
    mincells: 10
    # method options: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
    method: wilcoxon
    pseudo_seurat: False
    minpct: 0.1
    threshuse: 0.25
  atac:
    run: True
    layer: signac_norm
    mincells: 10
    # method options: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
    method: wilcoxon
    pseudo_seurat: False
    minpct: 0.1
    threshuse: 0.25
  # NOTE(review): unlike the single-modality entries, multimodal has no run or
  # layer key - confirm what the pipeline defaults to.
  multimodal:
    mincells: 10
    # method options: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
    method: wilcoxon
    pseudo_seurat: False
    minpct: 0.1
    threshuse: 0.25
# ---------------------------------------
# plot specs are used to define which metadata columns are used in the visualisations
# ---------------------------------------
# Plot settings.
plotspecs:
  # Layer to use for each modality. These match the layer values given under
  # markerspecs.
  layers:
    rna: logged_counts
    prot: clr
    atac: signac_norm
  # NOTE(review): top_n_markers is placed as a sibling of layers (not a
  # modality entry), based on its name - confirm against the pipeline's
  # expected schema.
  top_n_markers: 10