## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, echo = FALSE------------------------------------------------------
library(BinaryDosage)

## ----vcf_files----------------------------------------------------------------
vcf1afile <- system.file("extdata", "set1a.vcf", package = "BinaryDosage")
vcf1ainfo <- system.file("extdata", "set1a.info", package = "BinaryDosage")
vcf1bfile <- system.file("extdata", "set1b.vcf", package = "BinaryDosage")
vcf1binfo <- system.file("extdata", "set1b.info", package = "BinaryDosage")

## ----vcf_convert, message = FALSE, warning = FALSE----------------------------
bdfile1a <- tempfile()
bdfile1b <- tempfile()

# Without information file
vcftobdlegacy(vcffiles = vcf1afile, bdfiles = bdfile1a)

# With information file — embeds aaf, maf, avgcall, rsq in the header
vcftobdlegacy(vcffiles = c(vcf1bfile, vcf1binfo), bdfiles = bdfile1b)

## ----vcf_gz, message = FALSE, warning = FALSE---------------------------------
vcf1agzfile <- system.file("extdata", "set1a.vcf.gz", package = "BinaryDosage")
bdfile1a_gz <- tempfile()
vcftobdlegacy(vcffiles = vcf1agzfile, bdfiles = bdfile1a_gz, gz = TRUE)

## ----vcf_dosageonly, message = FALSE, warning = FALSE-------------------------
vcf2afile <- system.file("extdata", "set2a.vcf", package = "BinaryDosage")
bdfile2a <- tempfile()
vcftobdlegacy(vcffiles = vcf2afile, bdfiles = bdfile2a)

## ----snpidformat, message = FALSE, warning = FALSE----------------------------
vcf1brsfile <- system.file("extdata", "set1b_rssnp.vcf", package = "BinaryDosage")
bdfile_id0  <- tempfile()
bdfile_id1  <- tempfile()
bdfile_id2  <- tempfile()
bdfile_idm1 <- tempfile()

vcftobdlegacy(vcffiles = vcf1brsfile, bdfiles = bdfile_id0)
vcftobdlegacy(vcffiles = vcf1brsfile, bdfiles = bdfile_id1, snpidformat = 1L)
vcftobdlegacy(vcffiles = vcf1brsfile, bdfiles = bdfile_id2, snpidformat = 2L)
vcftobdlegacy(vcffiles = vcf1brsfile, bdfiles = bdfile_idm1, snpidformat = -1L)

snpnames <- data.frame(
  format0  = getbdinfo(bdfile_id0)$snps$snpid,
  format1  = getbdinfo(bdfile_id1)$snps$snpid,
  format2  = getbdinfo(bdfile_id2)$snps$snpid,
  formatm1 = getbdinfo(bdfile_idm1)$snps$snpid
)
knitr::kable(snpnames, caption = "SNP IDs by snpidformat")

## ----bdoptions, message = FALSE, warning = FALSE------------------------------
bdfile1a_calc <- tempfile()
vcftobdlegacy(vcffiles = vcf1afile, bdfiles = bdfile1a_calc,
              bdoptions = c("aaf", "maf", "rsq"))
bdcalcinfo <- getbdinfo(bdfile1a_calc)
knitr::kable(data.frame(bdcalcinfo$snpinfo),
             caption = "Calculated per-SNP statistics", digits = 3)

## ----gen_files----------------------------------------------------------------
gen3afile   <- system.file("extdata", "set3a.imp",    package = "BinaryDosage")
gen3asample <- system.file("extdata", "set3a.sample", package = "BinaryDosage")
gen3bfile   <- system.file("extdata", "set3b.imp",    package = "BinaryDosage")
gen3bsample <- system.file("extdata", "set3b.sample", package = "BinaryDosage")

## ----gen_convert, message = FALSE, warning = FALSE----------------------------
bdfile3a <- tempfile()
bdfile3b <- tempfile()

gentobd(genfiles = c(gen3afile, gen3asample),
        snpcolumns = c(0L, 2L:5L),
        bdfiles = bdfile3a)
gentobd(genfiles = c(gen3bfile, gen3bsample),
        snpcolumns = c(0L, 2L:5L),
        bdfiles = bdfile3b)

## ----gen_snpcolumns, message = FALSE, warning = FALSE-------------------------
gen3bchrfile <- system.file("extdata", "set3b.chr.imp", package = "BinaryDosage")
bdfile3b_chr <- tempfile()
# chromosome is in column 1, SNP ID in column 2, location in column 3, etc.
gentobd(genfiles = c(gen3bchrfile, gen3bsample), bdfiles = bdfile3b_chr)

## ----gen_impformat, message = FALSE, warning = FALSE--------------------------
gen2bfile   <- system.file("extdata", "set2b.imp",    package = "BinaryDosage")
sample2bfile <- system.file("extdata", "set2b.sample", package = "BinaryDosage")
bdfile2b <- tempfile()
gentobd(genfiles = c(gen2bfile, sample2bfile),
        snpcolumns = c(1L, 3L, 2L, 4L, 5L),
        impformat = 1L,
        bdfiles = bdfile2b)

## ----gen_gz, message = FALSE, warning = FALSE---------------------------------
gen4bgzfile  <- system.file("extdata", "set4b.imp.gz", package = "BinaryDosage")
sample4bfile <- system.file("extdata", "set4b.sample", package = "BinaryDosage")
bdfile4b_gz  <- tempfile()
gentobd(genfiles = c(gen4bgzfile, sample4bfile),
        snpcolumns = c(1L, 2L, 4L, 5L, 6L),
        startcolumn = 7L,
        impformat = 2L,
        gz = TRUE,
        bdfiles = bdfile4b_gz)

## ----merge, message = FALSE, warning = FALSE----------------------------------
bd1afile <- system.file("extdata", "vcf1a.bdose", package = "BinaryDosage")
bd1bfile <- system.file("extdata", "vcf1b.bdose", package = "BinaryDosage")
bd1merged <- tempfile()

bdmerge(mergefiles = bd1merged, bdfiles = c(bd1afile, bd1bfile))

bd1ainfo   <- getbdinfo(bd1afile)
bd1binfo   <- getbdinfo(bd1bfile)
bd1merinfo <- getbdinfo(bd1merged)

cat("Set 1a subjects:", nrow(bd1ainfo$samples), "\n")
cat("Set 1b subjects:", nrow(bd1binfo$samples), "\n")
cat("Merged subjects:", nrow(bd1merinfo$samples), "\n")

## ----getbdinfo, message = FALSE, warning = FALSE------------------------------
bd1ainfo <- getbdinfo(bdfiles = bd1afile)

## ----bdapply, message = FALSE, warning = FALSE--------------------------------
calculateaaf <- function(dosage, p0, p1, p2) {
  mean(dosage, na.rm = TRUE) / 2
}

bd1ainfo <- getbdinfo(bd1afile)
aaf_vals <- unlist(bdapply(bdinfo = bd1ainfo, func = calculateaaf))

aaf_table <- data.frame(snpid = bd1ainfo$snps$snpid, aaf = aaf_vals)
knitr::kable(aaf_table, caption = "Alternate allele frequencies", digits = 4)

## ----bdapply_getaaf, message = FALSE, warning = FALSE-------------------------
aaf_vals2 <- unlist(bdapply(bdinfo = bd1ainfo, func = getaaf))

## ----getsnp, message = FALSE, warning = FALSE---------------------------------
snp_data <- data.frame(getsnp(bdinfo = bd1ainfo, snp = "1:12000:T:C", dosageonly = FALSE))
snp_table <- cbind(sid = bd1ainfo$samples$sid, snp_data)
knitr::kable(head(snp_table, 10), caption = "SNP 1:12000:T:C (first 10 subjects)", digits = 4)

## ----vcfinfo, message = FALSE, warning = FALSE--------------------------------
vcfinfo <- getvcfinfo(vcffiles = vcf1afile, index = TRUE)
aaf_vcf  <- unlist(vcfapply(vcfinfo = vcfinfo, func = getaaf))

aaf_compare <- data.frame(snpid = vcfinfo$snps$snpid, aaf = aaf_vcf)
knitr::kable(aaf_compare, caption = "AAF from VCF via vcfapply", digits = 4)

## ----geninfo, message = FALSE, warning = FALSE--------------------------------
geninfo <- getgeninfo(genfiles = c(gen3bchrfile, gen3bsample), index = TRUE)
aaf_gen  <- unlist(genapply(geninfo = geninfo, func = getaaf))

aaf_gen_table <- data.frame(snpid = geninfo$snps$snpid, aaf = aaf_gen)
knitr::kable(aaf_gen_table, caption = "AAF from GEN via genapply", digits = 4)

## ----full_workflow, message = FALSE, warning = FALSE--------------------------
library(BinaryDosage)

# --- Input files ---
vcf1afile   <- system.file("extdata", "set1a.vcf",    package = "BinaryDosage")
vcf1ainfo   <- system.file("extdata", "set1a.info",   package = "BinaryDosage")
vcf1bfile   <- system.file("extdata", "set1b.vcf",    package = "BinaryDosage")
vcf1binfo   <- system.file("extdata", "set1b.info",   package = "BinaryDosage")
gen3afile   <- system.file("extdata", "set3a.imp",    package = "BinaryDosage")
gen3asample <- system.file("extdata", "set3a.sample", package = "BinaryDosage")
gen3bfile   <- system.file("extdata", "set3b.imp",    package = "BinaryDosage")
gen3bsample <- system.file("extdata", "set3b.sample", package = "BinaryDosage")

# --- Convert to binary dosage ---
bdfile1a <- tempfile(); bdfile1b <- tempfile()
bdfile3a <- tempfile(); bdfile3b <- tempfile()

vcftobdlegacy(vcffiles = c(vcf1afile, vcf1ainfo), bdfiles = bdfile1a)
vcftobdlegacy(vcffiles = c(vcf1bfile, vcf1binfo), bdfiles = bdfile1b)
gentobd(genfiles = c(gen3afile, gen3asample), snpcolumns = c(0L, 2L:5L), bdfiles = bdfile3a)
gentobd(genfiles = c(gen3bfile, gen3bsample), snpcolumns = c(0L, 2L:5L), bdfiles = bdfile3b)

# --- Merge ---
mergebd1 <- tempfile(); mergebd3 <- tempfile()
bdmerge(mergefiles = mergebd1, bdfiles = c(bdfile1a, bdfile1b))
bdmerge(mergefiles = mergebd3, bdfiles = c(bdfile3a, bdfile3b))

# --- Apply function ---
mergebd1info <- getbdinfo(mergebd1)
mergebd3info <- getbdinfo(mergebd3)

aaf1 <- unlist(bdapply(mergebd1info, getaaf))
aaf3 <- unlist(bdapply(mergebd3info, getaaf))

aaf <- cbind(mergebd1info$snps[, c("chromosome", "location", "snpid")],
             aaf_vcf = aaf1, aaf_gen = aaf3)
knitr::kable(aaf, caption = "AAF: VCF vs GEN sources", digits = 4, row.names = FALSE)

## ----full_workflow_snp, message = FALSE, warning = FALSE----------------------
# --- Extract SNP 6 ---
set1snp6 <- getsnp(mergebd1info, 6L)
set3snp6 <- getsnp(mergebd3info, 6L)

snpdf <- data.frame(
  subjectid = mergebd1info$samples$sid,
  dosage_vcf = unlist(set1snp6),
  dosage_gen = unlist(set3snp6)
)
knitr::kable(head(snpdf, 10),
             caption = "SNP 6 dosage: VCF vs GEN (first 10 subjects)",
             digits = 4, row.names = FALSE)

