Introduction

The mergebd function merges two or more Format 5 binary dosage files into a single Format 5 output file. The merge type is detected automatically from the input files.

Subject merge — subject IDs do not overlap across files. The output contains all subjects from every input file and the SNPs common to all files.
SNP merge — SNP IDs do not overlap across files. The output contains all SNPs from every input file and the subjects common to all files.

If both subject IDs and SNP IDs overlap across files, an error is returned because the merge type is ambiguous.

SNPs are identified by chromosome, position, reference allele, and alternate allele, regardless of the SNP ID format stored in each file.

The function takes the following parameters.

bdose_files — character vector of paths to the input .bdose files (at least two). The companion .bdi file for each is expected at paste0(bdose_files[i], ".bdi").
bdose_file — path for the output .bdose file. The companion .bdi file is written automatically to paste0(bdose_file, ".bdi").

Setup

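The examples below use the bgzipped VCF file included with the package, set1a.vcf.gz, which contains data for 60 subjects and 10 SNPs on chromosome 1. If the vcfppR package is not installed, the setup code falls back to the vcf1a.bdose file included with the package. All output files are written to a temporary directory.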

# Build the full Format 5 file that the examples below split and merge.
# The output is written to the session's temporary directory.
bdose_full <- file.path(tempdir(), "full.bdose")

# mustWork = TRUE makes system.file() stop with a clear error when the bundled
# example file cannot be found, instead of returning "" and handing an empty
# path to the function that reads it.
if (requireNamespace("vcfppR", quietly = TRUE)) {
  # vcfppR is installed: build the file from the bundled bgzipped VCF.
  vcftobd(vcffile    = system.file("extdata", "set1a.vcf.gz",
                                   package = "BinaryDosage", mustWork = TRUE),
          bdose_file = bdose_full)
} else {
  # vcfppR is not installed: run updatebd on the bundled vcf1a.bdose instead.
  updatebd(bdfiles    = system.file("extdata", "vcf1a.bdose",
                                    package = "BinaryDosage", mustWork = TRUE),
           bdose_file = bdose_full)
}

# Read the subject and SNP information back from the file just written.
bd_full <- getbdinfo(bdose_full)

cat("Subjects:", nrow(bd_full$samples), "\n")
#> Subjects: 60
cat("SNPs:    ", nrow(bd_full$snps),    "\n")
#> SNPs:     10

Subject merge

A subject merge combines files that cover different subjects but the same (or overlapping) set of SNPs. The output contains all subjects and the SNPs common to every input file.

The example splits the 60-subject file into two 30-subject files using subsetbd, then merges them back together.

# Output locations: the two 30-subject inputs and the merged result.
out_dir   <- tempdir()
bdose_a   <- file.path(out_dir, "set_a.bdose")
bdose_b   <- file.path(out_dir, "set_b.bdose")
bdose_out <- file.path(out_dir, "merged_subjects.bdose")

# Subject IDs of the full file, split into the first 30 and the last 30.
sids    <- bd_full$samples$sid
first30 <- sids[seq_len(30)]
last30  <- sids[31:60]

# Write one subset file per group of subjects.
subsetbd(bdfiles = bdose_full, bdose_file = bdose_a, subjectids = first30)
subsetbd(bdfiles = bdose_full, bdose_file = bdose_b, subjectids = last30)

# Merge the two subset files back into a single file.
mergebd(bdose_files = c(bdose_a, bdose_b), bdose_file = bdose_out)

# Read back the information for both inputs and for the merged output.
bd_a   <- getbdinfo(bdose_a)
bd_b   <- getbdinfo(bdose_b)
bd_out <- getbdinfo(bdose_out)

# Report the subject and SNP counts.
cat("File A subjects:", nrow(bd_a$samples), "\n")
#> File A subjects: 30
cat("File B subjects:", nrow(bd_b$samples), "\n")
#> File B subjects: 30
cat("Merged subjects:", nrow(bd_out$samples), "\n")
#> Merged subjects: 60
cat("Merged SNPs:    ", nrow(bd_out$snps), "\n")
#> Merged SNPs:     10

The merged file contains all 60 subjects and all 10 SNPs.

Verifying subject order

The subjects in the merged file appear in input-file order: all subjects from the first file followed by all subjects from the second file.

# Show the subject table of the merged file.
knitr::kable(x = bd_out$samples, caption = "Subjects in merged file")

Subjects in merged file
fid	sid
	I1
	I2
	I3
	I4
	I5
	I6
	I7
	I8
	I9
	I10
	I11
	I12
	I13
	I14
	I15
	I16
	I17
	I18
	I19
	I20
	I21
	I22
	I23
	I24
	I25
	I26
	I27
	I28
	I29
	I30
	I31
	I32
	I33
	I34
	I35
	I36
	I37
	I38
	I39
	I40
	I41
	I42
	I43
	I44
	I45
	I46
	I47
	I48
	I49
	I50
	I51
	I52
	I53
	I54
	I55
	I56
	I57
	I58
	I59
	I60

SNP merge

A SNP merge combines files that cover different SNPs but the same (or overlapping) set of subjects. The output contains all SNPs and the subjects common to every input file.

The example splits the 10-SNP file into two 5-SNP files using subsetbd, then merges them back together.

# Output locations: the two 5-SNP inputs and the merged result.
out_dir       <- tempdir()
bdose_snp_a   <- file.path(out_dir, "snp_a.bdose")
bdose_snp_b   <- file.path(out_dir, "snp_b.bdose")
bdose_snp_out <- file.path(out_dir, "merged_snps.bdose")

# SNP locations of the full file, split into the first 5 and the last 5.
locs   <- bd_full$snps$location
first5 <- locs[seq_len(5)]
last5  <- locs[6:10]

# Write one subset file per group of SNPs.
subsetbd(bdfiles = bdose_full, bdose_file = bdose_snp_a, locations = first5)
subsetbd(bdfiles = bdose_full, bdose_file = bdose_snp_b, locations = last5)

# Merge the two subset files back into a single file.
mergebd(bdose_files = c(bdose_snp_a, bdose_snp_b), bdose_file = bdose_snp_out)

# Read back the information for both inputs and for the merged output.
bd_snp_a   <- getbdinfo(bdose_snp_a)
bd_snp_b   <- getbdinfo(bdose_snp_b)
bd_snp_out <- getbdinfo(bdose_snp_out)

# Report the SNP and subject counts.
cat("File A SNPs:    ", nrow(bd_snp_a$snps), "\n")
#> File A SNPs:     5
cat("File B SNPs:    ", nrow(bd_snp_b$snps), "\n")
#> File B SNPs:     5
cat("Merged SNPs:    ", nrow(bd_snp_out$snps), "\n")
#> Merged SNPs:     10
cat("Merged subjects:", nrow(bd_snp_out$samples), "\n")
#> Merged subjects: 60

Verifying SNP order

SNPs appear in input-file order: all SNPs from the first file followed by all SNPs from the second file.

# Show the SNP table of the merged file.
knitr::kable(x = bd_snp_out$snps, caption = "SNPs in merged file")

SNPs in merged file
chromosome	location	snpid	reference	alternate
1	10000	1:10000:C:A	C	A
1	11000	1:11000:T:C	T	C
1	12000	1:12000:T:C	T	C
1	13000	1:13000:T:C	T	C
1	14000	1:14000:G:C	G	C
1	15000	1:15000:A:C	A	C
1	16000	1:16000:G:A	G	A
1	17000	1:17000:C:A	C	A
1	18000	1:18000:C:G	C	G
1	19000	1:19000:T:G	T	G

fid	sid
	I1
	I2
	I3
	I4
	I5
	I6
	I7
	I8
	I9
	I10
	I11
	I12
	I13
	I14
	I15
	I16
	I17
	I18
	I19
	I20
	I21
	I22
	I23
	I24
	I25
	I26
	I27
	I28
	I29
	I30
	I31
	I32
	I33
	I34
	I35
	I36
	I37
	I38
	I39
	I40
	I41
	I42
	I43
	I44
	I45
	I46
	I47
	I48
	I49
	I50
	I51
	I52
	I53
	I54
	I55
	I56
	I57
	I58
	I59
	I60

fid	sid
	I1
	I2
	I3
	I4
	I5
	I6
	I7
	I8
	I9
	I10
	I11
	I12
	I13
	I14
	I15
	I16
	I17
	I18
	I19
	I20
	I21
	I22
	I23
	I24
	I25
	I26
	I27
	I28
	I29
	I30
	I31
	I32
	I33
	I34
	I35
	I36
	I37
	I38
	I39
	I40
	I41
	I42
	I43
	I44
	I45
	I46
	I47
	I48
	I49
	I50
	I51
	I52
	I53
	I54
	I55
	I56
	I57
	I58
	I59
	I60