Pipeline refmap yaml

Download this file
# Pipeline pipeline_refmap.py configuration file
# ==============================================

# compute resource options
# ------------------------
# customize this depending on the cluster you're using
resources:
  # Number of threads requested for parallel jobs, split into three tiers.
  # NOTE(review): the tier notes below are phrased in terms of memory although
  # the keys set thread counts — presumably memory is allocated per thread on
  # your cluster; confirm for your scheduler.
  # High tier: must provide enough memory to load your mudata and do
  # computationally intensive tasks.
  threads_high: 1
  # Medium tier: must provide enough memory to load your mudata and do
  # computationally light tasks.
  threads_medium: 1
  # Low tier: must provide enough memory to load text files and do plotting;
  # requires much less memory than the other two tiers.
  threads_low: 1
# Path to the conda env. Leave as null if running natively or if your cluster
# automatically inherits the login node environment.
condaenv: null


# Allows tweaking which queues jobs get submitted to, in case there is a
# special queue for long jobs or you have access to a GPU-specific queue.
# The default queue should be specified in your .cgat.yml file.
# Leave both entries as null if you do not want to use the alternative queues.
queues:
  long: null
  gpu: null

#----------------------
# query dataset
#----------------------
# Path to the query dataset. We support raw 10x formats, or preprocessed,
# quality-filtered mudata or anndata, as input query.
query: data.dir/pancreas_querydata.h5ad
# If you supplied a mudata, specify the modality to be used.
# Currently only RNA queries are supported.
modality: rna
# Does your query data have a batch effect? If so, give the column that holds
# it; leave as null otherwise.
query_batch: null
# Does your query have a cell type annotation you want to compare to the
# transferred label? Give the column here; leave as null if not.
query_celltype: celltype
#----------------------
# scvi tools params
#----------------------
# Specify one or more reference models that you would like to use as
# reference. You can use your own reference that you built using
# pipeline_integration here. Leave a model key as null for no model
# specification.

# Reference anndata/mudata with RNA. You need this only if you want to
# calculate the umap on both reference and query data; otherwise leave it as
# null and only provide model paths.
reference_data: data.dir/pancreas_refdata.h5mu
# Path to the totalvi saved model.
totalvi: null
# NOTE(review): the next two options sit with the totalvi settings and are
# presumably only used when a totalvi model is given — confirm.
impute_proteins: true
transform_batch: null
# If you did not run scanvi and you simply want to map the query onto the
# reference, the scvi model path (path_to_scvi) will be used to initialize
# scanvi for label transfer. If you have an initialized scanvi model instead,
# add it under scanvi.
# NOTE(review): the paths below are absolute paths on one developer's machine;
# replace them with the location of your own saved models.
scvi:
  - /Users/fabiola.curion/Documents/devel/data_test.dir/data_for_scvi_test/pancreas_model/model.pt
scanvi:
  - /Users/fabiola.curion/Documents/devel/data_test.dir/data_for_scvi_test/pancreas_model_scanvi/model.pt
# For scvi and totalvi, if the reference model has a trained random forest
# classifier, use it to classify cells in the query.
run_randomforest: false



#----------------------
# training params
#----------------------

# to reuse these params (for example for WNN), please use anchors (&) and aliases (*) in the relevant place
# i.e. a block anchored with &rna_neighbors is reused wherever *rna_neighbors is referenced

training_plan:
  # The totalvi settings are anchored so that the other models can reuse them.
  totalvi: &totalvitraining
    # example params here https://docs.scvi-tools.org/en/0.14.1/api/reference/scvi.model.TOTALVI.train.html
    max_epochs: 200
    # Recommended weight decay is 0.0. This ensures the latent representation
    # of the reference cells will remain exactly the same if passing them
    # through this new query model.
    weight_decay: 0.0
  # scvi and scanvi reuse the totalvi training settings through the alias.
  scvi: *totalvitraining
  scanvi: *totalvitraining
#----------------------
# neighbors params to calculate umaps on either the query alone, or the query + reference dataset
#----------------------

neighbors:
  # Number of principal components to calculate for neighbours and umap:
  #   - if no correction is applied, PCA will be calculated and used to run
  #     UMAP and clustering on
  #   - if Harmony is the method of choice, it will use these components to
  #     create a corrected dim red.
  # note: scvelo default is 30
  npcs: 30
  # number of neighbours
  k: 30
  # metric: euclidean | cosine
  metric: euclidean
  # method: scanpy | hnsw (from scvelo)
  method: scanpy


#----------------------
# Run scib metrics on query, specify params:
#----------------------
# see documentation for the metrics used at https://scib.readthedocs.io/en/latest/
# scib is run on the query data after transferring labels, where available (totalvi and scanvi), or using the default leiden clustering after training the vae model (scvi)
scib:
  # Whether to run the scib metrics on the query.
  run: true
  # Used for ARI and NMI (if left as null, defaults to the leiden clustering
  # calculated on the new latent representation after reference mapping).
  cluster_key: null
  # Used for clisi_graph_embed (if left as null, defaults to query_batch
  # specified above).
  batch_key: null
  # Ground truth label (if left as null, defaults to query_celltype specified
  # above).
  celltype_key: null