# Libraries
library(rvat)
library(rvatData)

Some output will be written during the tutorials, use a temporary directory for that:

outdir <- tempdir()

Getting started

Connect to gdb:

gdb <- gdb(rvat_example("rvatData.gdb"))

Create varSets

For this tutorial, we’ll create a varSet containing variants with a moderate impact (unweighted), and a varSet that contains weights based on CADD scores. (more information on building varSets will be included later)

# Build a varset including variants with a moderate predicted impact
buildVarSet(object = gdb, 
            output = paste0(outdir, "/moderate.txt.gz"),
            varSetName = "Moderate", 
            unitTable = "varInfo", 
            unitName = "gene_name",
            where = "ModerateImpact = 1")

# Build a varset containing CADD weights
buildVarSet(object = gdb, 
            output = paste0(outdir, "/CADD.txt.gz"),
            varSetName = "CADD", 
            unitTable = "varInfo", 
            unitName = "gene_name",
            weightName = "CADDphred")

Initialize a GT object

moderatepath <-  paste0(outdir, "/moderate.txt.gz")
CADDpath <- paste0(outdir, "/CADD.txt.gz")

## Initialize a GT object
varsetfile <- varSetFile(moderatepath)
varset <- getVarSet(varsetfile, unit = "NEK1")
GT <- getGT(
    gdb,
    varSet = varset,
    cohort = "pheno")
## Retrieved genotypes for 106 variants

Run burden tests

Run a simple burden test:
- with ‘pheno’ column as (binary) outcome
- adjusting for 4 PCs
- Using the ‘firth’ method

burden_pheno_4PCs_firth <- assocTest(GT,
                                     pheno = "pheno",
                                     covar = c("PC1", "PC2", "PC3", "PC4"),
                                     test = "firth",
                                     name = "example" # optional name of the analysis
                                     )
burden_pheno_4PCs_firth
## rvbResult with 1 row and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight  test      nvar caseCarriers ctrlCarriers meanCaseScore
##       <Rle> <Rle> <numeric>    <numeric>    <numeric>     <numeric>
## 1         1 firth       104         1317         5185      0.305842
##   meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate    effect
##       <numeric> <numeric> <numeric>    <numeric>    <numeric> <numeric>
## 1       0.30199      5000     20000     0.965765     0.966087 0.0148229
##    effectSE effectCIlower effectCIupper        OR         P
##   <numeric>     <numeric>     <numeric> <numeric> <numeric>
## 1 0.0301844    -0.0446503      0.073697   1.01493  0.623855

Also include the SKAT (Wu et al. 2011) and ACAT-V (Liu et al. 2019) tests, all available tests are described in assocTest():

burden_pheno_4PCs <- assocTest(GT,
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("firth", "skat_robust", "acatv"),
                               name = "example" # optional name of the analysis
                                     )
burden_pheno_4PCs
## rvbResult with 3 rows and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 2        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 3        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight        test      nvar caseCarriers ctrlCarriers meanCaseScore
##       <Rle>       <Rle> <numeric>    <numeric>    <numeric>     <numeric>
## 1         1       firth       104         1317         5185      0.305842
## 2         1 skat_robust       104         1317         5185      0.305842
## 3         1       acatv       104         1317         5185      0.305842
##   meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate    effect
##       <numeric> <numeric> <numeric>    <numeric>    <numeric> <numeric>
## 1       0.30199      5000     20000     0.965765     0.966087 0.0148229
## 2       0.30199      5000     20000     0.965765     0.966087        NA
## 3       0.30199      5000     20000     0.965765     0.966087        NA
##    effectSE effectCIlower effectCIupper        OR           P
##   <numeric>     <numeric>     <numeric> <numeric>   <numeric>
## 1 0.0301844    -0.0446503      0.073697   1.01493 6.23855e-01
## 2        NA            NA            NA        NA 7.08149e-02
## 3        NA            NA            NA        NA 1.80267e-08

Run burden tests on a continuous phenotype, indicate that the phenotype is continuous by setting continuous = TRUE.

burden_age_4PCs <- assocTest(GT,
                               pheno = "age",
                               continuous = TRUE,
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("lm", "skat", "acatv"),
                               name = "example" # optional name of the analysis
                              )

Run weighted burden test

The assocTest() method applies the weights as included in the genoMatrix() object. For instance, if we load the varSet including the CADD weights, the genoMatrix includes these weights in the w field:

varsetfile <- varSetFile(CADDpath)
varset_CADD <- getVarSet(varsetfile, unit = "NEK1")
GT_CADD <- getGT(
    gdb,
    varSet = varset_CADD,
    cohort = "pheno")
## Retrieved genotypes for 190 variants
head(rowData(GT_CADD)$w)
## [1]   NA   NA   NA   NA 23.7   NA

If we now run a burden test using this genoMatrix, the weights will be applied (note that variants with missing weights are excluded):

burden_pheno_4PCs_firth_CADD <- assocTest(GT_CADD,
                                     pheno = "pheno",
                                     covar = c("PC1", "PC2", "PC3", "PC4"),
                                     test = "firth",
                                     name = "example" # optional name of the analysis
                                     )
burden_pheno_4PCs_firth_CADD
## rvbResult with 1 row and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1        NEK1  pheno       CADD example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight  test      nvar caseCarriers ctrlCarriers meanCaseScore
##       <Rle> <Rle> <numeric>    <numeric>    <numeric>     <numeric>
## 1         1 firth       117         1330         5199        4.7704
##   meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate     effect
##       <numeric> <numeric> <numeric>    <numeric>    <numeric>  <numeric>
## 1       4.45753      5000     20000     0.966291     0.966487 0.00476821
##     effectSE effectCIlower effectCIupper        OR         P
##    <numeric>     <numeric>     <numeric> <numeric> <numeric>
## 1 0.00191882   0.000985525    0.00850913   1.00478  0.013629

Run single variant tests

Running single variant tests is very similar to running burden tests. By setting the singlevar parameter to TRUE an association test is performed for each individual variant in the genotype matrix.

singlevar_pheno_4PCs_firth <- assocTest(GT,
                                     pheno = "pheno",
                                     singlevar = TRUE,
                                     covar = c("PC1", "PC2", "PC3", "PC4"),
                                     test = "firth",
                                     name = "example",
                                     minCarriers = 3
                                     )

Looping through varSets

The examples above show how genotypes are loaded into memory (in a genoMatrix()) object, and assocTest() is run on the genoMatrix. However, this step can be skipped, and assocTest() can be ran directly on a gdb() connection instead. This allows looping through the varsets included in a varSetList() or varSetFile() as demonstrated below.

First, we check what units are available in the varsetfile:

varsetfile <- varSetFile(moderatepath)
head(listUnits(varsetfile))
## [1] "ABCA4"   "CYP19A1" "FUS"     "IL3RA"   "NEK1"    "OPTN"

We’ll subset the first four for this example:

varsetlist <- getVarSet(varsetfile,
                        unit = listUnits(varsetfile)[1:4]
                        )
head(varsetlist,2)
## varSetList
## Contains 2 records
## [[1]]
## unit=ABCA4
## varSetName=Moderate
## VAR_id=29,31,32,34,35,36,39,40,41,42,43,45,46,47,48,49,51,52,53,55,57,58,59,60,61,65,67,68,69,70,71,72,73,74,75,76,83,87,89,91,93,94,95,96,97,98,99,100,102,103,104,105,106,107,109,111,114,115,116,117,118,122,123,128,129,130,131,134,135,136,137,138,139,140,141,142,143,144,145,146,148,149,150,151,152,153,156,157,158,159,160,161,162,168,170,172,174,175,176,177,178,179,184,185,186,188,189,190,191,192,193,194,199,200,201,203,204,205,206,208,209,210,211,212,215,216,217,219,221,222,223,224,225,226,227,228,229,230,232,235,236,237,240,243,244,246,248,249,252,253,255,256,258,260,262,263,268,270,272,274,276,277,278,279,281,282,283,284,285,287,289,290,293,294,295,296,297,298,299,300,303,304,305,308,309,310,312,313,314,316,318,320,321,322,323,325,326,327,328,331,333,336,337,342,343,344,345,347,348,350,352,354,355,356,357,358,359,360,361,363,364,367,368,369,370,371,373,374,375,377,380,381,382,386,388,389,390,391,394,398,399,400,401,404,405,406,407,408,412,413,417,419,420,421,423,424,425,427,429,430,432,433,434,435,436,437,438,439,441,442,443,446,447,449,450,453,454,456,459,460,461,462,463,464,466,467,468,469,470,471,475,476,477,478,480,481,482,485,489,490,491,492,493,495,496,497,498,499,501,503,504,505,506,509,510,512,514,516,518,522,525,526,529,530,533,534,535,537,539,541,543,544,545,546,547,548,550,551,552,553,554,556,557,558,559,562,563,564,566,567,568,570,572,573,577,578,580,581,583,584,585,587,589,590,591,592,593,594,595,596,598,601,603,604,605,607,608,609,611,613,614,616
## w=1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
## 
## [[2]]
## unit=CYP19A1
## varSetName=Moderate
## VAR_id=1085,1086,1089,1091,1092,1093,1095,1096,1097,1098,1100,1101,1104,1105,1106,1107,1108,1111,1112,1113,1115,1116,1117,1118,1119,1120,1121,1123,1124,1127,1129,1130,1132,1133,1134,1135,1136,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1149,1150,1151,1154,1155,1157,1158,1159,1160,1164,1166,1167,1168,1169,1170,1171
## w=1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1

Perform association tests on selected varSets:

burden_varsetlist <- assocTest(gdb,
                               varSet = varsetlist,
                               cohort="pheno",
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = "firth",
                               name = "example")
burden_varsetlist
## rvbResult with 4 rows and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1       ABCA4  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 2     CYP19A1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 3         FUS  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 4       IL3RA  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight  test      nvar caseCarriers ctrlCarriers meanCaseScore
##       <Rle> <Rle> <numeric>    <numeric>    <numeric>     <numeric>
## 1         1 firth       375         3628        14644    1.21245148
## 2         1 firth        61          613         2445    0.13606823
## 3         1 firth        55           37           82    0.00753858
## 4         1 firth        97         1659         6308    0.39475786
##   meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate     effect
##       <numeric> <numeric> <numeric>    <numeric>    <numeric>  <numeric>
## 1    1.24344918      5000     20000     0.962029     0.962052 -0.0308901
## 2    0.13481020      5000     20000     0.970590     0.970493  0.0110277
## 3    0.00424068      5000     20000     0.964785     0.964484  0.6044741
## 4    0.37778493      5000     20000     0.959489     0.958779  0.0575539
##    effectSE effectCIlower effectCIupper        OR          P
##   <numeric>     <numeric>     <numeric> <numeric>  <numeric>
## 1 0.0158922  -0.062117154   0.000190658  0.969582 0.05142525
## 2 0.0444870  -0.076903653   0.097538257  1.011089 0.80444367
## 3 0.1976478   0.206548488   0.983830045  1.830289 0.00337278
## 4 0.0288663   0.000768307   0.113943197  1.059242 0.04699061

Applying sample / variant filters

The following parameters can be used to filter samples based on callrate:
minCallrateSM and maxCallrateSM.

The following parameters can be used to filter variants based on callrate:
minCallrateVar, maxCallrateVar.

The following parameters can be used to filter variants based on allele/carrier frequency:
minMAF, maxMAF, minMAC, maxMAC, minCarriers and maxCarriers, minCarrierFreq and maxCarrierfreq.

Filter variants based on a maximum MAF of 0.0001 and minimum callrate of 0.9:

burden_pheno_4PCs_MAF001_varcr0.9 <- assocTest(GT,
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("firth", "skat_robust", "acatv"),
                               name = "example", # optional name of the analysis
                               maxMAF = 0.001,
                               minCallrateVar = 0.9)
## Keeping 25000/25000 available samples for analysis.
## 96/106 variants are retained for analysis
## 1/96 variants have zero carriers, these are dropped.

Additionally, filter samples based on a call-rate filter of > 0.95 :

burden_pheno_4PCs_MAF001_SMcr0.9 <- assocTest(GT,
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("firth", "skat_robust", "acatv"),
                               name = "example", # optional name of the analysis
                               maxMAF = 0.001,
                               minCallrateVar = 0.9,
                               minCallrateSM = 0.95
                                     )
## 4155 samples don't pass callRate filters.
## Keeping 20845/25000 available samples for analysis.
## 101/106 variants are retained for analysis
## 11/101 variants have zero carriers, these are dropped.

Applying weights based on minor allele frequency

Currently Madsen-Browning weighting is implemented (Madsen and Browning 2009), and can be applied by setting the MAFweights parameter:

burden_pheno_4PCs_mb <- assocTest(GT,
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("firth", "skat_robust", "acatv"),
                               MAFweights = "mb"
                               )

Alternative genetic models

By default the ‘allelic’ genetic model is applied (with 0,1,2 being the number of minor alleles). Alternative models (‘recessive’ and ‘dominant’) can be specified using the geneticModel parameter:

burden_pheno_4PCs_recessive <- assocTest(GT,
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("firth", "skat_robust", "acatv"),
                               geneticModel = "recessive")

burden_pheno_4PCs_dominant <- assocTest(GT,
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("firth", "skat_robust", "acatv"),
                               geneticModel = "dominant")

Note that the minCarrier, maxCarrier, minCarrierFreq and maxCarrierFreq parameters can be used to filter variants based on the number of carriers after applying the genetic model:

burden_pheno_4PCs_dominant_minCarrierFreq0.01 <- assocTest(GT,
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("firth", "skat_robust", "acatv"),
                               geneticModel = "dominant",
                               minCarrierFreq = 0.01)

Note on imputation

By default mean imputation is applied to missing values when performing burden tests (note that by default no imputation is applied for single variant tests), alternatively missing calls can be set to the reference allele:

burden_pheno_4PCs_missingToRef <- assocTest(GT,
                               pheno = "pheno",
                               covar = c("PC1", "PC2", "PC3", "PC4"),
                               test = c("firth", "skat_robust", "acatv"),
                               imputeMethod = "missingToRef")

Permutation P-values

Resampling is implemented for various tests (see the assocTest() help page for details). Simply set methodResampling to ‘permutation’ and specify the number of resamplings using the nResampling parameter:

burden_pheno_4PCs_permuted <- assocTest(GT,
                                     pheno = "pheno",
                                     covar = c("PC1", "PC2", "PC3", "PC4"),
                                     test = c("skat", "skat_burden", "acatv"),
                                     name = "example", # optional name of the analysis,
                                     methodResampling = "permutation",
                                     nResampling = 100
                                     )
burden_pheno_4PCs_permuted
## rvbResult with 6 rows and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 2        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 3        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 4        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 5        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 6        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight                  test      nvar caseCarriers ctrlCarriers
##       <Rle>                 <Rle> <numeric>    <numeric>    <numeric>
## 1         1                  skat       104         1317         5185
## 2         1           skat_burden       104         1317         5185
## 3         1                 acatv       104         1317         5185
## 4         1        skat_resampled       104         1317         5185
## 5         1 skat_burden_resampled       104         1317         5185
## 6         1       acatv_resampled       104         1317         5185
##   meanCaseScore meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate
##       <numeric>     <numeric> <numeric> <numeric>    <numeric>    <numeric>
## 1      0.305842       0.30199      5000     20000     0.965765     0.966087
## 2      0.305842       0.30199      5000     20000     0.965765     0.966087
## 3      0.305842       0.30199      5000     20000     0.965765     0.966087
## 4      0.305842       0.30199      5000     20000     0.965765     0.966087
## 5      0.305842       0.30199      5000     20000     0.965765     0.966087
## 6      0.305842       0.30199      5000     20000     0.965765     0.966087
##      effect  effectSE effectCIlower effectCIupper        OR           P
##   <numeric> <numeric>     <numeric>     <numeric> <numeric>   <numeric>
## 1        NA        NA            NA            NA        NA 7.04441e-02
## 2        NA        NA            NA            NA        NA 6.28886e-01
## 3        NA        NA            NA            NA        NA 1.80267e-08
## 4        NA        NA            NA            NA        NA 7.92079e-02
## 5        NA        NA            NA            NA        NA 6.03960e-01
## 6        NA        NA            NA            NA        NA 9.90099e-03

Make sure to set a seed to ensure that the results are reproducible:

# reproducibility 
set.seed(10)
burden_pheno_4PCs_permuted1 <- assocTest(GT,
                                     pheno = "pheno",
                                     covar = c("PC1", "PC2", "PC3", "PC4"),
                                     test = c("skat", "skat_burden", "acatv"),
                                     name = "example", # optional name of the analysis,
                                     methodResampling = "permutation",
                                     nResampling = 100
                                     )

set.seed(10)
burden_pheno_4PCs_permuted2 <- assocTest(GT,
                                     pheno = "pheno",
                                     covar = c("PC1", "PC2", "PC3", "PC4"),
                                     test = c("skat", "skat_burden", "acatv"),
                                     name = "example", # optional name of the analysis,
                                     methodResampling = "permutation",
                                     nResampling = 100
                                     )
waldo::compare(burden_pheno_4PCs_permuted1, burden_pheno_4PCs_permuted2)
## `old@metadata$creationDate`: "2025-02-12 12:32:41"
## `new@metadata$creationDate`: "2025-02-12 12:32:54"

Alternatively, a resamplingmatrix can be build using buildResamplingFile() and supplied to the resamplingMatrix parameter.

# reproducibility 
set.seed(10)
resamplingmatrix <- buildResamplingFile(nSamples = ncol(GT), nResampling = 100)
burden_pheno_4PCs_permuted <- assocTest(GT,
                                     pheno = "pheno",
                                     covar = c("PC1", "PC2", "PC3", "PC4"),
                                     test = c("skat", "skat_burden", "acatv"),
                                     name = "example", # optional name of the analysis,
                                     methodResampling = "permutation",
                                     resamplingMatrix = resamplingmatrix)
burden_pheno_4PCs_permuted
## rvbResult with 6 rows and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 2        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 3        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 4        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 5        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 6        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight                  test      nvar caseCarriers ctrlCarriers
##       <Rle>                 <Rle> <numeric>    <numeric>    <numeric>
## 1         1                  skat       104         1317         5185
## 2         1           skat_burden       104         1317         5185
## 3         1                 acatv       104         1317         5185
## 4         1        skat_resampled       104         1317         5185
## 5         1 skat_burden_resampled       104         1317         5185
## 6         1       acatv_resampled       104         1317         5185
##   meanCaseScore meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate
##       <numeric>     <numeric> <numeric> <numeric>    <numeric>    <numeric>
## 1      0.305842       0.30199      5000     20000     0.965765     0.966087
## 2      0.305842       0.30199      5000     20000     0.965765     0.966087
## 3      0.305842       0.30199      5000     20000     0.965765     0.966087
## 4      0.305842       0.30199      5000     20000     0.965765     0.966087
## 5      0.305842       0.30199      5000     20000     0.965765     0.966087
## 6      0.305842       0.30199      5000     20000     0.965765     0.966087
##      effect  effectSE effectCIlower effectCIupper        OR           P
##   <numeric> <numeric>     <numeric>     <numeric> <numeric>   <numeric>
## 1        NA        NA            NA            NA        NA 7.04441e-02
## 2        NA        NA            NA            NA        NA 6.28886e-01
## 3        NA        NA            NA            NA        NA 1.80267e-08
## 4        NA        NA            NA            NA        NA 7.92079e-02
## 5        NA        NA            NA            NA        NA 5.54455e-01
## 6        NA        NA            NA            NA        NA 9.90099e-03
waldo::compare(burden_pheno_4PCs_permuted, burden_pheno_4PCs_permuted1)
## `old@metadata$creationDate`: "2025-02-12 12:33:09"
## `new@metadata$creationDate`: "2025-02-12 12:32:41"

Instead of calculating resampled P-values, the individual resampled P-values can be obtained by setting outputResampling=TRUE.

burden_pheno_4PCs_permutations <- assocTest(GT,
                                     pheno = "pheno",
                                     covar = c("PC1", "PC2", "PC3", "PC4"),
                                     test = c("skat", "skat_burden", "acatv"),
                                     name = "example", # optional name of the analysis,
                                     methodResampling = "permutation",
                                     nResampling = 100,
                                     outputResampling=TRUE
                                     )
burden_pheno_4PCs_permutations
## DataFrame with 300 rows and 10 columns
##     varSetName cohort    name  unit           covar geneticModel MAFweight
##          <Rle>  <Rle>   <Rle> <Rle>           <Rle>        <Rle>     <Rle>
## 1     Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## 2     Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## 3     Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## 4     Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## 5     Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## ...        ...    ...     ...   ...             ...          ...       ...
## 296   Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## 297   Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## 298   Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## 299   Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
## 300   Moderate  pheno example  NEK1 PC1,PC2,PC3,PC4      allelic         1
##           pheno        test         P
##     <character> <character> <numeric>
## 1         perm1       acatv  0.284379
## 2         perm2       acatv  0.766812
## 3         perm3       acatv  0.386063
## 4         perm4       acatv  0.953518
## 5         perm5       acatv  0.703487
## ...         ...         ...       ...
## 296      perm96 skat_burden 0.7817500
## 297      perm97 skat_burden 0.0895937
## 298      perm98 skat_burden 0.2933150
## 299      perm99 skat_burden 0.0772219
## 300     perm100 skat_burden 0.9707966

ACAT: combining P-values

ACAT is a fast and powerful P-value combination method (Liu et al. 2019), boosting power by combining the different strengths of various statistical tests, annotations and weighting schemes. RVAT includes a flexible implementation of ACAT allowing the combination of P-values across any given analysis, for example across various statistical tests, weighting schemes, variant sets, covariate sets, MAF filters etc.

First, we’ll use ACAT to combine P-values across three statistical tests (firth, SKAT, and ACAT-V):

burden_results <- assocTest(GT,
                            pheno = "pheno",
                            covar = c("PC1", "PC2", "PC3", "PC4"),
                            test = c("firth", "skat", "acatv"),
                            name = "example")
burden_results
## rvbResult with 3 rows and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 2        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
## 3        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight  test      nvar caseCarriers ctrlCarriers meanCaseScore
##       <Rle> <Rle> <numeric>    <numeric>    <numeric>     <numeric>
## 1         1 firth       104         1317         5185      0.305842
## 2         1  skat       104         1317         5185      0.305842
## 3         1 acatv       104         1317         5185      0.305842
##   meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate    effect
##       <numeric> <numeric> <numeric>    <numeric>    <numeric> <numeric>
## 1       0.30199      5000     20000     0.965765     0.966087 0.0148229
## 2       0.30199      5000     20000     0.965765     0.966087        NA
## 3       0.30199      5000     20000     0.965765     0.966087        NA
##    effectSE effectCIlower effectCIupper        OR           P
##   <numeric>     <numeric>     <numeric> <numeric>   <numeric>
## 1 0.0301844    -0.0446503      0.073697   1.01493 6.23855e-01
## 2        NA            NA            NA        NA 7.04441e-02
## 3        NA            NA            NA        NA 1.80267e-08
ACAT(burden_results, aggregate = "test")
## rvbResult with 1 row and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1        NEK1  pheno   Moderate example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight  test      nvar caseCarriers ctrlCarriers meanCaseScore
##       <Rle> <Rle> <numeric>    <numeric>    <numeric>     <numeric>
## 1         1  ACAT       104         1317         5185      0.305842
##   meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate    effect
##       <numeric> <numeric> <numeric>    <numeric>    <numeric> <numeric>
## 1       0.30199      5000     20000     0.965765     0.966087        NA
##    effectSE effectCIlower effectCIupper        OR           P
##   <numeric>     <numeric>     <numeric> <numeric>   <numeric>
## 1        NA            NA            NA        NA 5.40802e-08

ACAT() also allows for stepwise ACAT, i.e. you might want to perform two rounds of ACAT: 1) ACAT across statistical tests followed by 2) ACAT across varSets. Step-wise ACAT is performed by supplying a list of the variables (in order), for example, below we first ACAT across statistical tests, followed by ACAT across varSets:

varsetfile <- varSetFile(CADDpath)
varset_CADD <- getVarSet(varsetfile, unit = "NEK1")
GT_CADD <- getGT(
    gdb,
    varSet = varset_CADD,
    cohort = "pheno")
burden_results_CADD <- assocTest(GT_CADD,
                            pheno = "pheno",
                            covar = c("PC1", "PC2", "PC3", "PC4"),
                            test = c("firth", "skat", "acatv"),
                            name = "example")

# Combine the moderate and CADD results:
burden_results_combined <- rbind(burden_results, burden_results_CADD)

# ACAT:
ACAT(burden_results_combined, aggregate = list("test", "varSetName"))
## rvbResult with 1 row and 24 columns
##          unit cohort varSetName    name pheno           covar geneticModel
##   <character>  <Rle>      <Rle>   <Rle> <Rle>           <Rle>        <Rle>
## 1        NEK1  pheno       ACAT example pheno PC1,PC2,PC3,PC4      allelic
##   MAFweight  test      nvar caseCarriers ctrlCarriers meanCaseScore
##       <Rle> <Rle> <numeric>    <numeric>    <numeric>     <numeric>
## 1         1  ACAT        NA           NA           NA            NA
##   meanCtrlScore     caseN     ctrlN caseCallRate ctrlCallRate    effect
##       <numeric> <numeric> <numeric>    <numeric>    <numeric> <numeric>
## 1            NA      5000     20000           NA           NA        NA
##    effectSE effectCIlower effectCIupper        OR           P
##   <numeric>     <numeric>     <numeric> <numeric>   <numeric>
## 1        NA            NA            NA        NA 2.37045e-08

(todo: add a note on P-values that are exactly 0 or 1; add a note on grouping variables)

Command-line

assocTest can also be ran from the command-line using the rvat command-line tool: (for more information see the tutorial on using rvat in the cmd-line: vignette("cmdline")). If a varset contains multiple records, the assocTest method will loop through each record:

Sys.setenv(rvat = system.file('exec/rvat.R', package = 'rvat'),
           gdbpath = rvatData::rvat_example('rvatData.gdb'),
           outdir = outdir
           )
# identify executable path:
varset=$outdir/moderate.txt.gz

Rscript $rvat --assocTest \
--gdb $gdbpath \
--varSet $varset \
--test firth,acatv,skat_robust \
--geneticModel allelic \
--cohort pheno \
--pheno pheno \
--covar PC1,PC2,PC3,PC4 \
--output $outdir/burden_cmdline1.txt.gz

Multiple phenotypes, tests, geneticModels, MAFweights and covars can be specified, for example, apply both allelic and recessive models and run for two covar sets:

varset=$outdir/moderate.txt.gz

Rscript $rvat --assocTest \
--gdb $gdbpath \
--varSet $varset \
--test firth,acatv,skat_robust \
--geneticModel allelic,recessive \
--cohort pheno \
--pheno pheno \
--covar PC1,PC2,PC3,PC4/PC1,PC2,PC3,PC4,age \
--output $outdir/burden_cmdline2.txt.gz

References

Liu, Yaowu, Sixing Chen, Zilin Li, Alanna C. Morrison, Eric Boerwinkle, and Xihong Lin. 2019. ACAT: A Fast and Powerful p Value Combination Method for Rare-Variant Analysis in Sequencing Studies.” The American Journal of Human Genetics 104 (3): 410–21. https://doi.org/10.1016/j.ajhg.2019.01.002.
Madsen, Bo Eskerod, and Sharon R. Browning. 2009. “A Groupwise Association Test for Rare Mutations Using a Weighted Sum Statistic.” Edited by Nicholas J. Schork. PLoS Genetics 5 (2): e1000384. https://doi.org/10.1371/journal.pgen.1000384.
Wu, Michael C., Seunggeun Lee, Tianxi Cai, Yun Li, Michael Boehnke, and Xihong Lin. 2011. “Rare-Variant Association Testing for Sequencing Data with the Sequence Kernel Association Test.” The American Journal of Human Genetics 89 (82-93): 82–93. https://doi.org/10.1016/j.ajhg.2011.05.029.