Reference documentation
https://www.rdocumentation.org/packages/data.table https://rdatatable.gitlab.io/data.table/reference/data.table.html
Basic data operations
# subselection of columns
dtb[,.(reads, i.reads, perc, pid)]
# removal of columns
df3[,pid:=NULL]
# create new columns with columnwise calculation
cols = c('delPer','insPer', 'snvPer','mnvPer')
#d[, c('delPer','insPer', 'snvPer','mnvPer') := list(deletion / total, insertion / total, SNV / total, MNV / total)]
d[, (cols) := .SD / total, .SDcols = c('deletion', 'insertion', 'SNV', 'MNV')]#same
# Calculate median by three cols
df.cov[,median(reads), by = .(chr, start, stop)][order(chr,start)]
# Calculate by variable levels sum column and max
dt.all[,list(sum(count),max(total)), by = .( sampleId)]
# Add columjn with rowindex
dt.sum[, rowNumber := 1:.N]
# Get frequencies by
t = as.data.table(table(dt.pileAll$label,dt.pileAll$index, dt.pileAll$sid))
tn = dt.pileAll[,.N, by = c('sid', 'label', 'index')] # same
melt and cast
# Vertically expand rows with sid by VariantType
dcast(d, sid~VariantType)
# Vertically expand rows with unique combi sid and index by label specific value N
dcast(t,index+sid~label, value.var = 'N')
# Horizontally bind measure.vars on basis of sid
melt(d, id = 'sid', measure.vars = c('insPer','snvPer', 'mnvPer' ))
Joining two data tables
https://rstudio-pubs-static.s3.amazonaws.com/52230_5ae0d25125b544caab32f75f0360e775.html
# one key
A = data.table(a=1:4, b= 12:15)
B= data.table(a=2:3, b= 13:14, key = 'a')
A[B, on = 'a']
#two or more keys
setkey(dt.cov, chr, start, stop, index)
setkey(dt.map, chr, start, stop, index)
df.cov[dt.map]
merge(df.cov, dt.map, all.x=T)
merge(df.cov, dt.map, all.x=T)
merge(df.cov, dt.map, all=T)
# Merge different data tables with same columns
merged = rbindlist(list(dt.hor, dt.ser, dt.ser2))
Apply function to 1 column
# Apply rowwise function
dt[, newCol := do.call(str_remove_all, list(string = oldCol, pattern = ' NA')
# Apply rowwise custom function
smaller2 <- function(x){
return (x < 2)
}
x <- data.table(a = c(1:3, 1), b = c('12 13', '14 15', '16 17', '18 19'))
x[, sm2 := do.call(smaller2, list (x = a))]
x[, sm3 := lapply(a, smaller2)]
# on-the-fly definition of
dt_all[,lpass := sapply(finfo , function (x) all(liberal %in% unlist(x)) )]
Feed 2 or more columns into functions
isCT <- function(x, y){
return (x == "C" & y == 'T')
}
dt.both[,mapply(isCT,Reference,Variant)]
library(data.table)
t = data.table(a=c(rep(LETTERS[1],5),rep(LETTERS[2],5)), b =seq(1,21,5), c = seq(1+4,21+4,5), d= 1:10)
tt = data.table(aa=c("A","B"), bb =c(7, 8))
f <- function (colA,colB, dt.ext){
return (dt.ext[a==colA & b<=colB & c>=colB,d])
}
mapply(f2, colA = tt$aa, colB = tt$bb, MoreArgs = list(t))
tt[,mapply(f, aa, bb, MoreArgs = list(dt.ext = t))]
# Apply row-wise apply with constant input
dt[,index2 := mapply(findRegionIndex, chr, position, MoreArgs = list (regions=dt.bed))]
# Apply row wise selection of first element of split function
dt.cnv[,name_short := lapply(stringr::str_split(dt.cnv$Name, '_'), `[`, i =c(1))]
Add two or more values from function call as columns
# Apply existing function rowwise and map outcome on two columns
x[, c('b1','b2') := do.call(Map, c(f = c, strsplit(b, ' ')))] # combine result of splitting function
x[, c('b1','b2') := data.table(t(x[,mapply(c, strsplit(b, split = ' ') , SIMPLIFY = FALSE)])) #same
# Add two return values as columns to data table
myfun <- function (y, v, g)
{
ret1 = y + v + length(g)
ret2 = y - v + length(g)
return(list(r1 = ret1, r2 = ret2))
}
x[,c("v1", 'v2') := data.table(t(mapply(myfun, colA, colB, colC)))]
Calc row-wise sum of comma-separated values
dt.dp1 = data.table(Freq = list("999", "99,9,0" , "99,93,0"))
dt.dp1[, dp1 := sapply(Freq, 'str_split', pattern = ',' )]
dt.dp1[, dp1 := str_split(Freq, pattern = ',' )]
dt.dp1[, dp1 := lapply(lapply(dp1, as.numeric), sum)]
Filter row-wise certain values from list
mylist = list(c("contamination", "panel_of_normals", "slippage", "weak_evidence"),
c("contamination", "panel_of_normals", "slippage", "weak_evidence"),
c("contamination", "orientation", "weak_evidence"))
filter = c("contamination", "slippage")
lapply(mylist, function (x) which(!x %in% filter))
# Can also be applied to data table
dt_all[,ff2 := lapply(ff, function(x) x[which(! x %in% mutect_filters)])]