gdb.Rd
Compressed SQLite (".gdb") representation of genotype data and associated variant annotation and sample info tables These allow for rapid and memory-efficient of loading sample genotype data en associated metadata within R. The slots of the gdb class are inherited entirely from the RSQLite SQLiteConnection class. A host of RVAT methods described here allow for convenient querying and manipulation of a gdb, for complex queries users can also directly perform SQL queries on the gdb as exemplified in the examples and tutorials.
gdb(x)
: Connect to a gdb object (created using buildGdb()
), where x
is the filepath.
In the following code snippets, x is a gdb object.
buildGdb(x)
: build a gdb directly from a vcf, see buildGdb()
for details.
In the following code snippets, x is a gdb object.
getGT(x, ...)
: Retrieve genotype data from the gdb, returns a genoMatrix()
object.
See getGT()
for details.
getAnno(x, ...)
: Get an annotation table from gdb. See the getAnno()
documentation for details.
getCohort(x, ...)
: Get a cohort table from gdb. See the getCohort()
documentation for details.
listCohort(x)
: List sample cohort tables that have been uploaded to the gdb.
listAnno(x)
: List variant info tables that have been uploaded to the gdb.
getGdbId(x)
, getGdbPath(x)
, getCreationDate(x)
, getGenomeBuild(x)
: various methods to retrieve indicate metadata from gdb
In the following code snippets, x is a gdb object.
uploadCohort(x, ...)
: Upload cohort data tables to gdb.
See the uploadCohort()
documentation for details.
uploadAnno(x, ...)
: Upload variant annotation data into gdb.
See the uploadAnno()
documentation for details
mapVariants(x, ...)
: Map variants in the gdb onto features provided in a bed,gff/gtf or ranges file.
See mapVariants()
for details.
dropTable(x, name)
: Drop table with the name specified in name
from gdb and clear from annotation / cohort metadata tables.
subSetGdb
: Create a new gdb that is a subset of the input gdb. See subsetGdb()
for details.
concatGdb
: Function to concatenate gdb databases. See concatGdb()
for details.
writeVcf
: Convert a gdb to a vcf-file. See writeVcf()
for details.
library(rvatData)
# build a gdb directly from a vcf file
vcfpath <- rvat_example("rvatData.vcf.gz") # example vcf from rvatData package
gdbpath <- tempfile() # write gdb to temporary file
gdb <- buildGdb(vcf = vcfpath,
output = gdbpath,
genomeBuild = "GRCh38")
#> 2025-02-12 12:25:34 Creating gdb tables
#> 25000 sample IDs detected
#> 2025-02-12 12:25:34 Parsing vcf records
#> 2025-02-12 12:26:04 Processing completed for 1000 records. Committing to db.
#> 2025-02-12 12:26:26 Processing completed for 1802 records. Committing to db.
#> 2025-02-12 12:26:26 Creating var table indexes
#> 2025-02-12 12:26:26 Creating SM table index
#> 2025-02-12 12:26:26 Creating dosage table index
#> 2025-02-12 12:26:26 Creating ranged var table
#> 2025-02-12 12:26:26 Complete
# upload variant info
varinfo <- rvat_example("rvatData.varinfo") # example variant info from rvatData package
uploadAnno(gdb, name = "varInfo", value = varinfo)
#> Loading table 'varInfo' from '/Users/phop2/Library/R/x86_64/4.4/library/rvatData/extdata/rvatData.varinfo'
#> 17 fields detected (CHROM,POS,ID,REF,ALT,QUAL,FILTER,AC,AN,AF,gene_name,HighImpact,ModerateImpact,Synonymous,CADDphred,PolyPhen,SIFT)
#> [1] 1
# upload cohort info
pheno <- rvat_example("rvatData.pheno") # example cohort info from rvatData package
uploadCohort(gdb, name = "pheno", value = pheno)
#> Loading cohort 'pheno' from '/Users/phop2/Library/R/x86_64/4.4/library/rvatData/extdata/rvatData.pheno'
#> 10 fields detected (IID,sex,pheno,pop,superPop,PC1,PC2,PC3,PC4,age)
#> 12865 males, 12135 females and 0 unknown gender
#> Retained 25000 of 25000 uploaded samples that could be mapped to dosage matrix
#> [1] 1
# list annotations and cohorts present in gdb
listAnno(gdb)
#> name
#> 1 varInfo
#> value
#> 1 /Users/phop2/Library/R/x86_64/4.4/library/rvatData/extdata/rvatData.varinfo
#> date
#> 1 Wed Feb 12 12:26:26 2025
listCohort(gdb)
#> name
#> 1 pheno
#> value
#> 1 /Users/phop2/Library/R/x86_64/4.4/library/rvatData/extdata/rvatData.pheno
#> date
#> 1 Wed Feb 12 12:26:26 2025
# retrieve annotations for a genomic interval
varinfo <- getAnno(gdb,
table = "varinfo",
ranges = data.frame(CHROM = "chr1", start = 11013847, end = 11016874))
head(varinfo)
#> VAR_id CHROM POS ID REF ALT QUAL FILTER AC AN
#> 1 1 chr1 11013912 NA A G 722.13 NA 1 45484
#> 2 2 chr1 11013928 rs755357622 C T 2598.23 NA 1 49194
#> 3 3 chr1 11013936 NA A C 4135.36 NA 1 50000
#> 4 4 chr1 11013936 NA A G 4135.36 NA 1 42636
#> 5 5 chr1 11013952 NA C G 517.13 NA 1 49574
#> 6 6 chr1 11016874 rs80356715 C T 64910.10 NA 32 49766
#> AF gene_name HighImpact ModerateImpact Synonymous CADDphred
#> 1 0.0000219858 TARDBP 0 1 0 24.8
#> 2 0.0000203277 TARDBP 0 0 1 .
#> 3 0.0000200000 TARDBP 0 1 0 24.1
#> 4 0.0000234544 TARDBP 0 1 0 22.6
#> 5 0.0000201719 TARDBP 0 0 1 .
#> 6 0.0006430093 TARDBP 0 1 0 22.2
#> PolyPhen SIFT
#> 1 P D
#> 2 . .
#> 3 B T
#> 4 B T
#> 5 . .
#> 6 B T
# see ?getAnno for more details
# retrieve cohort
pheno <- getCohort(gdb, cohort = "pheno")
head(pheno)
#> IID sex pheno pop superPop PC1 PC2 PC3 PC4
#> 1 ALS1 2 1 PJL SAS 22.58749 4.738025 38.9157564 -10.1667342
#> 2 ALS2 2 1 BEB SAS 26.37370 -1.445903 31.9712989 -6.0222555
#> 3 ALS3 1 1 TSI EUR 30.75496 48.866843 -5.4092404 14.9474404
#> 4 ALS4 2 1 GWD AFR -87.48035 -6.193628 -0.9504131 -1.2029932
#> 5 ALS5 1 1 MSL AFR -87.79310 -5.323925 -1.0353300 -0.7403234
#> 6 ALS6 1 1 YRI AFR -86.64071 -3.418417 -1.3964977 1.0561607
#> age
#> 1 66.01743
#> 2 66.84306
#> 3 54.35749
#> 4 65.09777
#> 5 58.94664
#> 6 63.13635
# see ?getCohort for more details
# delete table
uploadAnno(gdb, name = "varInfo2", value = varinfo)
#> Loading table 'varInfo2' from interactive R session'
#> 18 fields detected (VAR_id,CHROM,POS,ID,REF,ALT,QUAL,FILTER,AC,AN,AF,gene_name,HighImpact,ModerateImpact,Synonymous,CADDphred,PolyPhen,SIFT)
#> [1] 1
dropTable(gdb, "varInfo2")
#> Table 'varInfo2' removed from gdb
# retrieve gdb metadata
getGdbPath(gdb)
#> [1] "/var/folders/cl/wvc0rvjx4vd5rzt2_fhpmfth0000gp/T//RtmpRC6myU/file1825cc33abe9"
getGenomeBuild(gdb)
#> [1] "GRCh38"
# retrieve genotypes
# first extract variant ids that we want to retrieve the genotypes for
# note that `where` here is an SQL compliant
varinfo <- getAnno(gdb,
table = "varinfo",
where = "gene_name = 'SOD1' and ModerateImpact = 1")
# retrieve genotypes for variants extracted above
GT <- getGT(gdb,
VAR_id = varinfo$VAR_id,
cohort = "pheno")
#> Retrieved genotypes for 38 variants
# retrieve genotypes for a given genomic interval
GT_fromranges <- getGT(
gdb,
ranges = data.frame(CHROM = "chr21", start = 31659666, end = 31668931),
cohort = "pheno",
anno = "varInfo",
annoFields = c("VAR_id", "CHROM", "POS", "REF", "ALT", "HighImpact", "ModerateImpact", "Synonymous")
)
#> Retrieved genotypes for 49 variants
# Learn about the getGT method in ?getGT
# Learn about the genoMatrix format in ?genoMatrix
# Learn more about the downstream methods available for the gdb class in the relevant help pages
# e.g. ?mapVariants, ?subsetGdb, ?assocTest, ?writeVcf
close(gdb)