Getting started

Some output will be written during the tutorials, a temporary directory will be used for that:

outdir <- tempdir()
Sys.setenv(rvat = system.file('exec/rvat.R', package = 'rvat'),
           vcfpath = rvatData::rvat_example('rvatData.vcf.gz'),
           phenopath = rvatData::rvat_example('rvatData.pheno'),
           varinfopath = rvatData::rvat_example('rvatData.varinfo'),
           outdir = outdir
           )

Identify RVAT executable path:

#rvat=$(Rscript -e "cat(system.file('exec/rvat.R', package = 'rvat'))")

Main help page:

Rscript $rvat --help
## Rare variant analysis toolkit (rvat)
## 
## Error: Please provide a valid function call.
## 
## Provide a valid function call for detailed help.
## --buildGdb
## --concatGdb
## --uploadAnno
## --mapVariants
## --uploadCohort
## --listAnno
## --listCohort
## --dropTable
## --subsetGdb
## --buildVarSet
## --spatialClust
## --summariseGeno
## --aggregate
## --mergeAggDbs
## --collapseAggDbs
## --vcfInfo2Table
## --writeVcf
## --assocTest
## --buildResamplingFile
## --buildGeneSet
## --buildCorMatrix
## --geneSetAssoc
## Error:                                                                                                                                                              
## Execution halted

Function specific help page:

Rscript $rvat --buildGdb
## 
## buildGdb
## 
##   Creates a new gdb file. 
##   The gdb can be structured and populated using a provided vcf file. 
## 
## Usage:
##   Rscript rvat.R --buildGdb --vcf={vcf} --output={output} [options]
## 
## Arguments:
##   vcf              Input vcf file used to structure and populate gdb. 
##                    Warning this function makes the following assumptions: 
##                      1) strict adherence to vcf format (GT subfield first element in genotype fields),
##                      2) desired genotype QC has already been applied (DP,GQ filters),
##                      3) GT values conform to the set {0/0,0/1,1/0,1/1,./.,0|0,0|1,1|0,1|1,.|.}.
##                      Multiallelic parsing and genotype QC can be performed using vcftools and/or
##                      accompanying parser scripts included in the rvat repository.
##   output           Path for output gdb file.
##   skipIndexes      Flag to skip generation of indexes for var and dosage table (VAR_id;CHROM,POS,REF,ALT).  
##                    Typically only required if you plan to use --concatGdb to concatenate a series of separately generated gdb files.
##   skipVarRanges    Flag to skip generation of ranged var table.
##                    Typically only useful (i.e., faster) if you plan to use --concatGdb to concatenate a series of separately generated gdb files.
##   overWrite        Flag to indicate that output gdb file should be overwritten if it already exists.
##   genomeBuild      Optional genome build to include in the gdb metadata.
##                    If specified, it will be used to set ploidies (diploid, XnonPAR, YnonPAR) if the genome build is implemented in RVAT (currently: GRCh37, hg19, GRCh38, hg38).
##   memlimit         Maximum number of vcf records to parse at a time, defaults to 1000.
##   quiet            Flag to suppress verbose messages.
##   
## Error:                                                                                                                                                              
## Execution halted

Build GDB

Import VCF:

Rscript $rvat \
--buildGdb \
--vcf ${vcfpath} \
--genomeBuild GRCh38 \
--output $outdir/rvat_tutorials_cmdline.gdb \
--overWrite

Import sample metadata

Rscript $rvat --uploadCohort \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--name pheno \
--value ${phenopath}

list imported cohorts:

Rscript $rvat --listCohort --gdb $outdir/rvat_tutorials_cmdline.gdb

Import variant annotation tables

Rscript $rvat --uploadAnno \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--name varInfo \
--value ${varinfopath}

list anno tables:

Rscript $rvat --listAnno --gdb $outdir/rvat_tutorials_cmdline.gdb

Association tests

Build varSets

build varSet containing moderate variants:

Rscript $rvat --buildVarSet \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--varSetName Moderate \
--unitTable varInfo \
--unitName gene_name \
--where "ModerateImpact = 1" \
--output ${outdir}/moderate_cmdline.txt.gz

build multiple varSets

tables=(HighImpact ModerateImpact CADDphred PolyPhen)
weights=(1 1 CADDphred 1)
wheres=("HighImpact = 1" "ModerateImpact = 1" "CADDphred is not 'NA'" "PolyPhen is 'D'")
n=${#tables[@]}
for ((i=0;i<${n};i++))
   do
      Rscript $rvat --buildVarSet \
      --gdb $outdir/rvat_tutorials_cmdline.gdb \
      --unitTable varInfo \
      --unitName gene_name \
      --varSetName ${tables[$i]} \
      --where "${wheres[$i]}" \
      --weightName ${weights[$i]} \
      --output $outdir/${tables[$i]}_cmdline.txt.gz
   done &> $outdir/varsets.log

Merge variant sets for coordinated set based association testing of multiple filters

varSets=(HighImpact ModerateImpact CADDphred PolyPhen)
gunzip -c $outdir/${varSets[1]}_cmdline.txt.gz | grep '^#' > $outdir/mergedVarsets_cmdline.txt
(for varSet in ${varSets[@]}
  do
      gunzip -c $outdir/${varSet}_cmdline.txt.gz | grep -v '^#'
  done) | sort -t '|' -s -k1,1 >> $outdir/mergedVarsets_cmdline.txt
gzip -c $outdir/mergedVarsets_cmdline.txt > $outdir/mergedVarsets_cmdline.txt.gz

Run burden tests

Rscript $rvat \
--assocTest \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--varSet $outdir/mergedVarsets_cmdline.txt.gz \
--name maxMAF001 \
--cohort pheno \
--pheno pheno \
--test firth,skat_robust,acatv \
--output $outdir/burden_tests_merged_cmdline.txt.gz &> $outdir/burden_tests_merged_cmdline.log

Run singlevar tests

Rscript $rvat \
--assocTest \
--singlevar \
--gdb $outdir/rvat_tutorials_cmdline.gdb \
--varSet ${outdir}/moderate_cmdline.txt.gz \
--name maxMAF001 \
--cohort pheno \
--pheno pheno \
--test firth \
--output $outdir/singlevar_tests_merged_cmdline.txt.gz &> $outdir/singlevar_tests_merged_cmdline.log