# Pipeline Integration yml

# Download this file

# ============================================================
# Integration workflow Panpipes (pipeline_integration.py)
# ============================================================
# This file contains the parameters for the integration workflow.
# For full descriptions of the parameters, see the documentation at https://panpipes-pipelines.readthedocs.io/en/latest/yaml_docs/pipeline_integration_yml.html


#--------------------------
# Compute resources options
#--------------------------
resources:
  # Thread counts per task tier (high / medium / low); all three are set to 1 here.
  # NOTE(review): which pipeline tasks fall into which tier is decided by the
  # workflow, not by this file - confirm against the workflow documentation.
  threads_high: 1
  threads_medium: 1
  threads_low: 1
  # Thread count for GPU tasks; presumably paired with the `gpu` queue - confirm.
  threads_gpu: 2

  
# Path to conda env, leave blank if running native or your cluster automatically inherits the login node environment
condaenv:

# Cluster queue names; both entries are left blank in this file.
# NOTE(review): presumably `long` is for long-running jobs and `gpu` for GPU
# jobs - confirm against the workflow documentation.
queues:
  long: 
  gpu:  

# --------------------------------
# Loading and merging data options
# --------------------------------

# ----------------------------
# Data format
# Prefix identifying this run; presumably used to name output files - confirm.
sample_prefix: teaseq
# Input object for the integration workflow (a .h5mu file, per its extension).
preprocessed_obj: teaseq.h5mu


#-----------------
# Batch correction
# ----------------
# Batch correction is performed unimodally, i.e. each modality is batch-corrected independently

# ------------
# RNA modality
rna:
  # Whether to batch correct the RNA modality
  run: True
  # Comma-separated list of batch correction tools; each tool listed here has
  # a matching argument block below
  tools: harmony,scvi,bbknn
  # Column holding the batch label (presumably an obs column - confirm)
  column: dataset

  # Harmony arguments
  harmony:
    sigma: 0.1
    theta: 1.0
    npcs: 30

  # BBKNN args # https://bbknn.readthedocs.io/en/latest/
  bbknn:
    neighbors_within_batch: 20

  # SCVI args
  # Keys left empty are parsed as null (presumably the tool's defaults then
  # apply - confirm). Nested mappings use the same 2-space indentation as the
  # rest of the file.
  scvi:
    seed: 1492
    exclude_mt_genes: True
    mt_column: mt
    model_args:
      n_layers:
      n_latent:
      gene_likelihood: zinb
    training_args:
      max_epochs: 400
      train_size: 0.9
      early_stopping: True
    training_plan:
      lr: 0.001
      n_epochs_kl_warmup: 400
      reduce_lr_on_plateau: True
      lr_scheduler_metric:
      lr_patience: 8
      lr_factor: 0.1

  # Find neighbour parameters
  # The &rna_neighbors anchor lets the multimodal WNN `knn` section reuse this
  # mapping through the *rna_neighbors alias; keep the anchor name in sync.
  neighbors: &rna_neighbors
    npcs: 30
    k: 30
    metric: euclidean
    method: scanpy

# ----------------
# Protein modality
prot:
  # Whether to batch correct the protein modality
  run: True
  # Comma-separated list of batch correction tools; only harmony is enabled here
  tools: harmony
  # Column holding the batch label (presumably an obs column - confirm)
  column: orig.ident 

  # Harmony args
  harmony:
    sigma: 0.1
    theta: 1.0
    npcs: 30

  # BBKNN args # https://bbknn.readthedocs.io/en/latest/
  # NOTE(review): bbknn is configured here but not listed in `tools` above;
  # presumably this block is ignored unless bbknn is added - confirm.
  bbknn:
    neighbors_within_batch: 20

  # Find neighbour parameters
  # The &prot_neighbors anchor is reused by the multimodal WNN `knn` section
  # through the *prot_neighbors alias.
  neighbors: &prot_neighbors
    npcs: 30
    k: 30
    metric: euclidean
    method: scanpy

# -------------
# ATAC modality
atac:
  # Whether to batch correct the ATAC modality
  run: True
  # Dimensionality reduction selected for ATAC (LSI here)
  dimred: LSI
  # Comma-separated list of batch correction tools
  tools: harmony,bbknn
  # Column holding the batch label (presumably an obs column - confirm)
  column: dataset

  # Harmony args
  harmony:
    sigma: 0.1
    theta: 1.0
    npcs: 30

  # BBKNN args # https://bbknn.readthedocs.io/en/latest/
  # NOTE(review): bbknn is listed in `tools`, but neighbors_within_batch is
  # left empty (null) here while rna and prot set it to 20 - confirm that the
  # tool default is intended.
  bbknn:
    neighbors_within_batch: 

  # Find neighbour parameters
  # The &atac_neighbors anchor is reused by the multimodal WNN `knn` section
  # through the *atac_neighbors alias.
  neighbors: &atac_neighbors
    npcs: 30
    k: 30
    metric: euclidean
    method: scanpy


#-----------------------
# multimodal integration
# ----------------------
# remember to specify knn graph params in the section "neighbors"
multimodal:
  # Whether to run multimodal integration
  run: True
  # Tools are given as a YAML list here (the unimodal sections above use
  # comma-separated strings instead)
  tools:
    - WNN
    - totalvi
  column_categorical: sample_id

  # TotalVI arguments
  totalvi:
    seed: 1492
    modalities: rna,prot
    exclude_mt_genes: True
    mt_column: mt
    filter_by_hvg: True
    filter_prot_outliers: False
    model_args:
      latent_distribution: "normal"
    training_args:
      max_epochs: 100
      train_size: 0.9
      early_stopping: True
    # NOTE(review): a bare `None` is parsed by YAML as the string "None", not
    # as null; the MultiVI block below leaves training_plan empty instead.
    # Left unchanged - confirm which form the workflow expects.
    training_plan: None

  # MultiVI arguments
  # MultiVI is configured here but is not listed under `tools` above.
  MultiVI:
    seed: 1492
    lowmem: True
    model_args:
      n_hidden:
      n_latent:
      region_factors: True
      latent_distribution: 'normal'
      deeply_inject_covariates: False
      fully_paired: False

    training_args:
      max_epochs: 500
      lr: 0.0001
      use_gpu:
      train_size: 0.9
      validation_size:
      batch_size: 128
      weight_decay: 0.001
      # Written with a decimal point on purpose: YAML 1.1 resolvers (e.g.
      # PyYAML) load `1e-08` as a string, whereas `1.0e-08` loads as a float.
      eps: 1.0e-08
      early_stopping: True
      save_best: True
      check_val_every_n_epoch:
      n_steps_kl_warmup:
      n_epochs_kl_warmup: 50
      adversarial_mixing: True
    training_plan:

  # Mofa arguments
  mofa:
    modalities:
    filter_by_hvg: True
    n_factors: 10
    n_iterations: 1000
    convergence_mode: fast
    save_parameters: False
    outfile:

  # WNN arguments
  WNN:
    modalities: rna,prot,atac
    # NOTE(review): the bare `None` values below are parsed by YAML as the
    # string "None", not as null. Left unchanged - confirm the workflow
    # expects this spelling.
    batch_corrected:
      rna: None
      prot: None
      atac: None

    # please use anchors (&) and aliases (*) if necessary
    # Each alias below reuses the `neighbors` mapping anchored in the
    # corresponding unimodal section (rna / prot / atac).
    knn:
      rna: *rna_neighbors
      prot: *prot_neighbors
      atac: *atac_neighbors

    # WNN neighbour search
    n_neighbors:
    n_bandwidth_neighbors: 20
    n_multineighbors: 200
    metric: 'euclidean'
    low_memory: True

  # KNN calculation for multimodal analysis
  neighbors:
    npcs: 30
    k: 30
    metric: euclidean
    method: scanpy


#--------------------
# Plotting parameters
#--------------------
plotqc:
  # Comma-separated columns, presumably used to group/colour the plots - confirm
  grouping_var: dataset,sample_id

  # The values below follow a `modality:column` pattern.
  # NOTE(review): `rep` is not one of the modalities configured in this file
  # (rna, prot, atac) - confirm the input object actually contains it.
  all: rep:receptor_subtype
  rna: rna:total_counts
  prot: prot:total_counts
  # Left empty (null) for ATAC
  atac:
  multimodal: rna:total_counts


#-------------
# scib metrics
#-------------
#Obs columns containing the cell type labels
scib:
  # One entry per modality, naming the obs column with the cell type labels.
  # All three are left empty (null) in this file.
  # NOTE(review): presumably an empty entry skips the metrics for that
  # modality - confirm.
  rna:
  prot:
  atac:


# -------------------------
# Creating the final object
# -------------------------
final_obj:
  # For each modality: `include` toggles whether it goes into the final object
  # and `bc_choice` names the representation to keep.
  # NOTE(review): each bc_choice below is either `no_correction` or a tool
  # listed in that modality's `tools`; presumably this is required - confirm.
  rna:
    include: True
    bc_choice: no_correction
  prot:
    include: True
    bc_choice: harmony
  # ATAC is excluded from the final object (include: False); bc_choice is
  # still set.
  atac:
    include: False
    bc_choice: bbknn
  multimodal:
    include: True
    bc_choice: WNN