## dplyr package

### Basic data operations

# https://sejdemyr.github.io/r-tutorials/basics/tables-in-r/

# Select a single column and rename it in the same step (new name = old name)
data %>% select("My Label" = colName)

# Keep only the rows where colName equals myValue. The comparison operator is
# `==`; a single `=` is parsed as a named argument, not as a test.
data %>% filter(colName == myValue)

# Add a column holding the running (cumulative) sum of colName
data %>% mutate(cum = cumsum(colName))

# Apply a function to every column except `sample`, here rounding to two
# decimal places. Arguments given after the function are forwarded to it,
# so the deprecated funs() helper is not needed.
data %>% mutate_at(vars(-sample), round, digits = 2)

# Sort rows by colName in descending order
data %>% arrange(desc(colName))

# Reshape to long format.
#   key   - name of the new column that receives the gathered column names
#   value - name of the new column that receives the gathered values
#   ...   - columns to gather; prefix a column with minus to exclude it
data %>% gather("variable", "value", -date)
data %>% gather("variable", "value", 1:3)

# Reshape to wide format.
#   key   - column whose values become the names of the new columns
#   value - column whose values are spread across those new columns
data %>% spread("variable", "value")

# Stack two data frames on top of each other; `.id` adds an "id" column
# recording which input each row came from
one <- mtcars[1:4, ]
two <- mtcars[11:14, ]
bind_rows(one, two, .id = "id")

# Put two data frames side by side.
# NOTE(review): the inputs have 1 and 2 rows; whether the single row is
# recycled or this raises an error depends on the installed dplyr version.
data.frame(x = 1) %>% bind_cols(data.frame(y = 1:2))

### Grouping and ungrouping data

# Apply a function row by row, e.g. partially match the string in Alt
# against Ref for each row
data <- data %>%
  rowwise() %>%
  mutate(partMatch = pmatch(Alt, Ref))

# Remove grouping
data %>% ungroup()

# Alternative: drop the grouping by converting back to a plain data frame.
# as.data.frame() is used instead of overwriting class(), which would leave
# the grouping attributes attached to the object.
data <- as.data.frame(data)

### Common data aggregation tasks

# Group-wise frequencies and cumulative proportions of `kind` within each
# sample (count() keeps the grouping by sample, so sum(n) is per sample)
data %>%
  group_by(sample) %>%
  count(kind) %>%
  mutate(freq = n / sum(n), cum = cumsum(freq))

# Group-wise descriptive statistics of AF, rounded to two decimal places.
# The summaries are listed in the desired output order, so no extra select()
# is needed; `round` is passed directly instead of the deprecated funs().
data %>%
  group_by(sample) %>%
  summarize(
    min = min(AF),
    max = max(AF),
    median = median(AF),
    mean = mean(AF)
  ) %>%
  mutate_at(vars(-sample), round, digits = 2)

# Count the rows for each sampleId/Overlap combination, then spread the counts
# so that every Overlap value gets its own column. Three ways to write it:

# 1. explicit summarize()
da %>%
  group_by(sampleId, Overlap) %>%
  summarize(n = n()) %>%
  spread(key = Overlap, value = n)

# 2. tally() on the grouped data
da %>%
  group_by(sampleId, Overlap) %>%
  tally() %>%
  spread(key = Overlap, value = n)

# 3. count() without an explicit group_by()
da %>%
  count(sampleId, Overlap) %>%
  spread(key = Overlap, value = n)


# Replace a trailing "x" in every column name with "tnac"
setNames(da, gsub("x$", "tnac", names(da)))

### Conditionals


# Derive a label from the two flag columns. Conditions are tried top to
# bottom and the first match wins; rows matching neither of the first two
# conditions fall through to "Overlap".
mutate(
  da,
  Overlap = case_when(
    in_tn == TRUE & in_to == FALSE ~ "TN_only",
    in_tn == FALSE & in_to == TRUE ~ "TO_only",
    TRUE ~ "Overlap"
  )
)

### Apply a row-wise function to data

````R
# Add a column that is TRUE when none of the values in the row is NA
# NOTE(review): `30%` is not a syntactic R name, so it is back-quoted here to
# refer to a column literally named "30%" - confirm the intended column name.
dWide2 %>%
  select(AF_60, `30%`) %>%
  mutate(both = apply(., 1, function(x) all(!is.na(x))))
````

## Tidyr
````R
# Split the Position column at ":" into two new columns, "chr" and "rest"
df %>% tidyr::separate(Position, into = c("chr", "rest"), sep = ":")
````