# Reference documentation

## Basic data operations

``````# subselection of columns

# removal of columns
df3[,pid:=NULL]

# create new columns with columnwise calculation
cols = c('delPer','insPer', 'snvPer','mnvPer')
#d[, c('delPer','insPer', 'snvPer','mnvPer') := list(deletion / total, insertion / total, SNV / total, MNV / total)]
d[, (cols) := .SD / total, .SDcols = c('deletion', 'insertion', 'SNV', 'MNV')]#same

# Calculate median by three cols
df.cov[,median(reads), by = .(chr, start, stop)][order(chr,start)]

# Calculate by variable levels sum column and max
dt.all[,list(sum(count),max(total)), by = .( sampleId)]

dt.sum[, rowNumber := 1:.N]

# Get frequencies by
t = as.data.table(table(dt.pileAll\$label,dt.pileAll\$index, dt.pileAll\$sid))
tn = dt.pileAll[,.N, by = c('sid', 'label', 'index')] # same
``````

## melt and cast

``````
# Vertically expand rows with sid by VariantType
dcast(d, sid~VariantType)

# Vertically expand rows with unique combi sid and index by label specific value N
dcast(t,index+sid~label, value.var = 'N')

# Horizontally bind measure.vars on basis of sid
melt(d, id = 'sid', measure.vars = c('insPer','snvPer', 'mnvPer' ))

``````

## Joining two data tables

https://rstudio-pubs-static.s3.amazonaws.com/52230_5ae0d25125b544caab32f75f0360e775.html

``````# one key
A = data.table(a=1:4, b= 12:15)
B= data.table(a=2:3, b= 13:14, key = 'a')
A[B, on = 'a']

#two or more keys
setkey(dt.cov, chr, start, stop, index)
setkey(dt.map, chr, start, stop, index)
df.cov[dt.map]

merge(df.cov, dt.map, all.x=T)
merge(df.cov, dt.map, all.x=T)
merge(df.cov, dt.map, all=T)

# Merge different data tables with same columns
merged = rbindlist(list(dt.hor, dt.ser, dt.ser2))

``````

# Apply function to 1 column

``````# Apply rowwise function
dt[, newCol := do.call(str_remove_all, list(string = oldCol, pattern = ' NA')

# Apply rowwise custom function
smaller2 <- function(x){
return (x < 2)
}

x <- data.table(a = c(1:3, 1), b = c('12 13', '14 15', '16 17', '18 19'))
x[, sm2  := do.call(smaller2, list (x = a))]
x[, sm3  := lapply(a, smaller2)]

# on-the-fly definition of
dt_all[,lpass := sapply(finfo , function (x) all(liberal %in% unlist(x)) )]

``````

# Feed 2 or more columns into functions

``````
isCT <- function(x, y){
return (x == "C" & y == 'T')
}
dt.both[,mapply(isCT,Reference,Variant)]

library(data.table)
t = data.table(a=c(rep(LETTERS[1],5),rep(LETTERS[2],5)), b =seq(1,21,5), c = seq(1+4,21+4,5), d= 1:10)
tt = data.table(aa=c("A","B"), bb =c(7, 8))

f <- function (colA,colB, dt.ext){
return (dt.ext[a==colA & b<=colB & c>=colB,d])
}

mapply(f2, colA = tt\$aa, colB = tt\$bb, MoreArgs = list(t))
tt[,mapply(f, aa, bb, MoreArgs = list(dt.ext = t))]

# Apply row-wise apply with constant input
dt[,index2 := mapply(findRegionIndex, chr, position, MoreArgs = list (regions=dt.bed))]

# Apply row wise selection of first element of split function
dt.cnv[,name_short := lapply(stringr::str_split(dt.cnv\$Name, '_'), `[`, i =c(1))]
``````

# Add two or more values from function call as columns

``````
# Apply existing function rowwise and map outcome on two columns
x[, c('b1','b2')  := do.call(Map, c(f = c, strsplit(b, ' ')))] # combine result of splitting function
x[, c('b1','b2') := data.table(t(x[,mapply(c, strsplit(b, split = ' ') , SIMPLIFY = FALSE)])) #same

# Add two return values as columns to data table
myfun <- function (y, v, g)
{
ret1 = y + v + length(g)
ret2 = y - v + length(g)
return(list(r1 = ret1, r2 = ret2))
}

x[,c("v1", 'v2') := data.table(t(mapply(myfun, colA, colB, colC)))]

``````

## Calc row-wise sum of comma-separated values

``````
dt.dp1 = data.table(Freq = list("999",  "99,9,0" , "99,93,0"))
dt.dp1[, dp1 := sapply(Freq, 'str_split', pattern = ',' )]
dt.dp1[, dp1 := str_split(Freq, pattern = ',' )]
dt.dp1[, dp1 := lapply(lapply(dp1, as.numeric), sum)]

``````

## Filter row-wise certain values from list

``````
mylist = list(c("contamination", "panel_of_normals", "slippage", "weak_evidence"),
c("contamination", "panel_of_normals", "slippage", "weak_evidence"),
c("contamination", "orientation", "weak_evidence"))

filter = c("contamination", "slippage")

lapply(mylist, function (x) which(!x %in% filter))

# Can also be applied to data table
dt_all[,ff2 := lapply(ff, function(x) x[which(! x %in% mutect_filters)])]
``````