Pipeline ingest yaml - multiome data

Download this file
# ============================================================
# Ingest workflow Panpipes (pipeline_ingest.py)
# ============================================================
# This file contains the parameters for the ingest workflow.
# For full descriptions of the parameters, see the documentation at https://panpipes-pipelines.readthedocs.io/en/latest/yaml_docs/pipeline_ingestion_yml.html


#--------------------------
# Compute resources options
#--------------------------
resources:
  threads_high: 8
  threads_medium: 4
  threads_low: 2

# Path to conda env; leave blank if running natively or if your cluster automatically inherits the login node environment
condaenv: 

# --------------------------------
# Loading and merging data options
# --------------------------------

# ----------------------------
# Project name and data format
project: "mome"
sample_prefix: "mome"
use_existing_h5mu: False
submission_file: multiomecaf.txt
metadatacols:
concat_join_type: inner

#--------------------------
# Modalities in the project
modalities:
  rna: True
  prot: False
  bcr: False
  tcr: False
  atac: True

#--------------------------------
# Integrating barcode level data
# e.g. demultiplexing with hashtags, chemical tags or lipid tagging
barcode_mtd:
  include: False
  path:
  metadatacols:  

#------------------------------------------
# Loading Protein data - additional options
protein_metadata_table:
index_col_choice:
load_prot_from_raw: False
subset_prot_barcodes_to_rna: False


# -----------------------------
# Quality Control (QC) options
# -----------------------------

# -----------------------------------
# Processing of 10X cellranger metrics files
plot_10X_metrics: True

# ----------------------------------
# Doublet detection on RNA modality
scr:
  run: True
  expected_doublet_rate: 0.06
  sim_doublet_ratio: 2
  n_neighbours: 20
  min_counts: 2
  min_cells: 3
  min_gene_variability_pctl: 85
  n_prin_comps: 30
  use_thr: True
  call_doublets_thr: 0.25

# ----------------------------
# RNA modality Quality Control

# Providing a gene list
# see documentation at https://panpipes-pipelines.readthedocs.io/en/latest/usage/gene_list_format.html
custom_genes_file: panpipes-tutorials/tutorials/ingesting_data/qc_genelist_1.0.csv

# Defining actions on the genes

# (for pipeline_ingest.py)
calc_proportions: hb,mt,rp,ig
score_genes: MarkersNeutro

# cell cycle action
ccgenes: default

# ------------------------
# Plotting RNA QC metrics
# all metrics should be provided as a comma-separated string, e.g. a,b,c
plotqc_grouping_var: sample_id
plotqc_rna_metrics: doublet_scores,pct_counts_mt,pct_counts_rp,pct_counts_hb,pct_counts_ig

# ----------------------------
# Plotting Protein QC metrics

# requires prot_path to be included in the submission file
# all metrics should be provided as a comma-separated string, e.g. a,b,c
plotqc_prot_metrics: total_counts,log1p_total_counts,n_prot_by_counts,pct_counts_isotype
plot_metrics_per_prot: total_counts,log1p_total_counts,n_cells_by_counts,mean_counts

identify_isotype_outliers: True
isotype_upper_quantile: 90
isotype_n_pass: 2

# ---------------------
# Plot ATAC QC metrics

# set is_paired to True if a multiome is ingested
is_paired: True
# If this is NOT a multiome experiment, but you have an RNA anndata that you would like to use for TSS enrichment,
# use partner_rna to specify the path to the file and provide a features_tss file with the TSS coordinates.
# Leave empty if multiome is used.
partner_rna:
features_tss:
plotqc_atac_metrics: n_genes_by_counts,total_counts,pct_fragments_in_peaks,atac_peak_region_fragments,atac_mitochondrial_reads,atac_TSS_fragments

# ---------------------------
# Plot Repertoire QC metrics
ir_dist:
  metric:
  sequence:

clonotype_definition:
  receptor_arms:
  dual_ir:
  within_group:

plotqc_rep_metrics:
# provide a list of items
 - is_cell
 - extra_chains
 - clonal_expansion
 - rep:receptor_type
 - rep:receptor_subtype
 - rep:chain_pairing
 - rep:multi_chain


# -------------------------------------
# Profiling Protein Ambient background
# -------------------------------------
# PLEASE NOTE that this analysis can only be run if your inputs are from cellranger raw outputs

assess_background: False
downsample_background: True

# -----------------------------------------------------
# Files required for profiling ambient background or running dsb normalisation

# The pipeline requires the raw_feature_bc_matrix folder from cellranger or equivalent.
# Specify it in the submission file path with {mod}_filetype set to "cellranger", "cellranger_multi", or "10X_h5"
# so that the .h5 file or matrix folder is searched for automatically when profiling ambient background or running dsb normalisation.

#-------------------------------------------
# Investigate per-channel antibody staining
channel_col: sample_id
save_norm_prot_mtx: False


#----------------------
# Protein normalization
#----------------------

normalisation_methods: clr

#-----------------------------------------------
# Centered log ratio (CLR) normalization options

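# margin determines whether you normalise per cell (as you would for RNA),
# or by feature (recommended, due to the variable nature of prot assays).
# CLR margin 0 is recommended for informative QC plots in this pipeline
# NOTE(review): the text above recommends per-feature normalisation, yet the mapping below labels margin 0 as per cell;
# confirm against the panpipes documentation which margin value is per feature.
# 0 = normalise row-wise (per cell)
# 1 = normalise column-wise (per feature)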
clr_margin: 0

#--------------------------------------------------------------
# Denoised and Scaled by Background (DSB) normalization options
quantile_clipping: True