Data table package

Reference documentation

https://www.rdocumentation.org/packages/data.table https://rdatatable.gitlab.io/data.table/reference/data.table.html

Basic data operations

# subselection of columns
dtb[,.(reads, i.reads, perc, pid)]

# removal of columns
df3[,pid:=NULL]

# create new columns with columnwise calculation 
cols = c('delPer','insPer', 'snvPer','mnvPer')
#d[, c('delPer','insPer', 'snvPer','mnvPer') := list(deletion / total, insertion / total, SNV / total, MNV / total)]
d[, (cols) := .SD / total, .SDcols = c('deletion', 'insertion', 'SNV', 'MNV')]#same

# Calculate median by three cols
df.cov[,median(reads), by = .(chr, start, stop)][order(chr,start)]

# Calculate by variable levels sum column and max
dt.all[,list(sum(count),max(total)), by = .( sampleId)]  

# Add columjn with rowindex
dt.sum[, rowNumber := 1:.N] 

# Get frequencies by 
t = as.data.table(table(dt.pileAll$label,dt.pileAll$index, dt.pileAll$sid))
tn = dt.pileAll[,.N, by = c('sid', 'label', 'index')] # same

melt and cast


# Vertically expand rows with sid by VariantType
dcast(d, sid~VariantType)

# Vertically expand rows with unique combi sid and index by label specific value N
dcast(t,index+sid~label, value.var = 'N')

# Horizontally bind measure.vars on basis of sid 
melt(d, id = 'sid', measure.vars = c('insPer','snvPer', 'mnvPer' ))

Joining two data tables

https://rstudio-pubs-static.s3.amazonaws.com/52230_5ae0d25125b544caab32f75f0360e775.html

# one key
A = data.table(a=1:4, b= 12:15)
B= data.table(a=2:3, b= 13:14, key = 'a')
A[B, on = 'a']


#two or more keys
setkey(dt.cov, chr, start, stop, index)
setkey(dt.map, chr, start, stop, index)
df.cov[dt.map]


merge(df.cov, dt.map, all.x=T)
merge(df.cov, dt.map, all.x=T)
merge(df.cov, dt.map, all=T)

# Merge different data tables with same columns
merged = rbindlist(list(dt.hor, dt.ser, dt.ser2))

Apply function to 1 column

# Apply rowwise function 
dt[, newCol := do.call(str_remove_all, list(string = oldCol, pattern = ' NA')

# Apply rowwise custom function 
smaller2 <- function(x){
    return (x < 2)
}

x <- data.table(a = c(1:3, 1), b = c('12 13', '14 15', '16 17', '18 19'))
x[, sm2  := do.call(smaller2, list (x = a))]
x[, sm3  := lapply(a, smaller2)]


Feed 2 or more columns into functions


isCT <- function(x, y){
    return (x == "C" & y == 'T')
}
dt.both[,mapply(isCT,Reference,Variant)]

library(data.table)
t = data.table(a=c(rep(LETTERS[1],5),rep(LETTERS[2],5)), b =seq(1,21,5), c = seq(1+4,21+4,5), d= 1:10)
tt = data.table(aa=c("A","B"), bb =c(7, 8))
             

f <- function (colA,colB, dt.ext){
  return (dt.ext[a==colA & b<=colB & c>=colB,d])
}


mapply(f2, colA = tt$aa, colB = tt$bb, MoreArgs = list(t))
tt[,mapply(f, aa, bb, MoreArgs = list(dt.ext = t))]

# Apply row-wise apply with constant input
dt[,index2 := mapply(findRegionIndex, chr, position, MoreArgs = list (regions=dt.bed))]

# Apply row wise selection of first element of split function
dt.cnv[,name_short := lapply(stringr::str_split(dt.cnv$Name, '_'), `[`, i =c(1))]

Add two or more values from function call as columns


# Apply existing function rowwise and map outcome on two columns
x[, c('b1','b2')  := do.call(Map, c(f = c, strsplit(b, ' ')))] # combine result of splitting function
x[, c('b1','b2') := data.table(t(x[,mapply(c, strsplit(b, split = ' ') , SIMPLIFY = FALSE)])) #same

# Add two return values as columns to data table
myfun <- function (y, v, g) 
{
  ret1 = y + v + length(g)
  ret2 = y - v + length(g)
  return(list(r1 = ret1, r2 = ret2))
}

x[,c("v1", 'v2') := data.table(t(mapply(myfun, colA, colB, colC)))]



Calc row-wise sum of comma-separated values


dt.dp1 = data.table(Freq = list("999",  "99,9,0" , "99,93,0"))
dt.dp1[, dp1 := sapply(Freq, 'str_split', pattern = ',' )]
dt.dp1[, dp1 := lapply(lapply(dp1, as.numeric), sum)]