Getting started

Some output will be written during the tutorials, a temporary directory will be used for that:

outdir <- tempdir()
Sys.setenv(rvat = system.file('exec/rvat.R', package = 'rvat'),
           vcfpath = rvatData::rvat_example('rvatData.vcf.gz'),
           phenopath = rvatData::rvat_example('rvatData.pheno'),
           varinfopath = rvatData::rvat_example('rvatData.varinfo'),
           outdir = outdir
           )

Identify RVAT executable path:

#rvat=$(Rscript -e "cat(system.file('exec/rvat.R', package = 'rvat'))")

Main help page:

Rscript $rvat --help
## Rare variant analysis toolkit (rvat)
## 
## Error: Please provide a valid function call.
## 
## Provide a valid function call for detailed help.
## --buildGdb
## --concatGdb
## --uploadAnno
## --mapVariants
## --uploadCohort
## --listAnno
## --listCohort
## --dropTable
## --subsetGdb
## --buildVarSet
## --spatialClust
## --summariseGeno
## --aggregate
## --mergeAggregateFiles
## --collapseAggregateFiles
## --vcfInfo2Table
## --assocTest
## --buildResamplingFile
## --buildGeneSet
## --buildCorMatrix
## --geneSetAssoc
## Error:                                                                                                                                                              
## Execution halted

Function specific help page:

Rscript $rvat --buildGdb
## 
## buildGdb
## 
##   Creates a new gdb file. The gdb can be structured and populated using a provided vcf file. 
## 
## Usage:
##   Rscript rvat.R --buildGdb --vcf={vcf} --output={output} [options]
## 
## Arguments:
##   output           Path for output gdb file
##   vcf              Input vcf file used to structure and populate gdb. Warning this function makes the following of assumptions: 1) strict adherence to vcf format (GT subfield first element in genotype firelds), 2) multiallelic records have been split, 3) desired genotype QC has already been applied (DP,GQ filters), 4) GT values conform to the set {0/0,0/1,1/0,1/1,./.,0|0,0|1,1|0,1|1,.|.}. Multiallelic parsing and genotype QC can be performed using vcftools and/or accompanying parser scripts included on the rvat github.
##   skipIndexes      Flag to skip generation of indexes for var and dosage table (VAR_id;CHROM, POS,REF,ALT). Typically only required if you plan to use gdbConcat to concatenate a series of separately generated gdb files before use
##   skipVarRanges    Flag to skip generation of ranged var table. Typically only required if you plan to use gdbConcat to concatenate a series of separately generated gdb files before use
##   overWrite        Overwrite if output already exists? Defaults to FALSE, in which case an error is raised.
##   genomeBuild      Optional genome build to include in the gdb metadata. If specified, it will be used to set ploidies (diploid, XnonPAR, YnonPAR) if the genome build is implemented in RVAT (currently: GRCh37, hg19, GRCh38, hg38).
##   memlimit         Maximum number of vcf records to parse at a time, defaults to 1000.
##   
## Error:                                                                                                                                                              
## Execution halted

Build GDB

Import VCF:

Rscript $rvat \
--buildGdb \
--vcf ${vcfpath} \
--genomeBuild GRCh38 \
--output $outdir/rvat_tutorials_cmdline.gdb \
--overWrite

Import sample metadata

Rscript $rvat --uploadCohort \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--name pheno \
--value ${phenopath}

list imported cohorts:

Rscript $rvat --listCohort --gdb $outdir/rvat_tutorials_cmdline.gdb

Import variant annotation tables

Rscript $rvat --uploadAnno \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--name varInfo \
--value ${varinfopath}

list anno tables:

Rscript $rvat --listAnno --gdb $outdir/rvat_tutorials_cmdline.gdb

Association tests

Build varSets

build varSet containing moderate variants:

Rscript $rvat --buildVarSet \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--varSetName Moderate \
--unitTable varInfo \
--unitName gene_name \
--where "ModerateImpact = 1" \
--output ${outdir}/moderate_cmdline.txt.gz

build multiple varSets

tables=(HighImpact ModerateImpact CADDphred PolyPhen)
weights=(1 1 CADDphred 1)
wheres=("HighImpact = 1" "ModerateImpact = 1" "CADDphred is not 'NA'" "PolyPhen is 'D'")
n=${#tables[@]}
for ((i=0;i<${n};i++))
   do
      Rscript $rvat --buildVarSet \
      --gdb $outdir/rvat_tutorials_cmdline.gdb \
      --unitTable varInfo \
      --unitName gene_name \
      --varSetName ${tables[$i]} \
      --where "${wheres[$i]}" \
      --weightName ${weights[$i]} \
      --output $outdir/${tables[$i]}_cmdline.txt.gz
   done &> $outdir/varsets.log

Merge variant sets for coordinated set based association testing of multiple filters

varSets=(HighImpact ModerateImpact CADDphred PolyPhen)
(gunzip -c $outdir/${varSets[0]}_cmdline.txt.gz | grep '^#'; for varSet in ${varSets[@]}
  do
      gunzip -c $outdir/${varSet}_cmdline.txt.gz | grep -v '^#'
  done) | sort -t '|' -s -k1,1 | gzip > $outdir/mergedVarsets_cmdline.txt.gz

Run burden tests

Rscript $rvat \
--assocTest \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--varSet $outdir/mergedVarsets_cmdline.txt.gz \
--name maxMAF001 \
--cohort pheno \
--pheno pheno \
--test firth,skat_robust,acatv \
--output $outdir/burden_tests_merged_cmdline.txt.gz &> $outdir/burden_tests_merged_cmdline.log

Run singlevar tests

Rscript $rvat \
--assocTest \
--singlevar \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--varSet ${outdir}/moderate_cmdline.txt.gz \
--name maxMAF001 \
--cohort pheno \
--pheno pheno \
--test firth \
--output $outdir/singlevar_tests_merged_cmdline.txt.gz &> $outdir/singlevar_tests_merged_cmdline.log