A Tutorial to Multi-view Sequential Canonical Covariance Analysis

This tutorial uses an empirical example to describe what we can achieve from MultiSeqCCoA. The package was written by Ruoqing Zhu. The reference of the paper that put forward this package is recommended as

This paper is currently under review. For a detailed introduction to MultiSeqCCoA, please refer to MultiSeqCCoA.pptx.

Import libraries

All data filter and rearrangement are based on package data.table.
Anytime package is used for converting Unix time to real time.

library(MultiSeqCCA)
library(data.table)
library(anytime)

Read Data

The sample dataset is available: UA_comments.csv, AA_comments.csv, SW_comments.csv.

UA_comments <- fread("UA_comments.csv")
AA_comments <- fread("AA_comments.csv")
SW_comments <- fread("SW_comments.csv")

Data structure

Column Names

colnames(UA_comments)
#>  [1] "comment_id"    "post_id"       "company_name"  "like_num"     
#>  [5] "word_count"    "comment_count" "unix_time"     "happy"        
#>  [9] "angry"         "excited"       "sad"           "fear"         
#> [13] "bored"

Variable Descriptions

var_des <- fread("var_description.csv")
knitr::kable(var_des)

comment_id	Comment ID
post_id	Post ID of this comment
company_name	Name of the airline company
like_num	Number of like’s of this comment
word_count	Word count of this comment
comment_count	Comment count of this comment
unix_time	Unix time
happy	Happy score
angry	Angry score
excited	Excited score
sad	Sad score
fear	Fear score
bored	Bored score

Data structure

head(UA_comments)
#>                             comment_id                           post_id
#> 1: 10155190762835781_10155334944980781 199504650087085_10155190762835781
#> 2:   1070677129636495_1466023640101840  199504650087085_1070677129636495
#> 3:   1273344189369787_1442256722478532  199504650087085_1273344189369787
#> 4:   1286880621349477_1443113662392838  199504650087085_1286880621349477
#> 5:   1309018619135677_1444949168875954  199504650087085_1309018619135677
#> 6:    1315890551781817_166234563942792  199504650087085_1315890551781817
#>    company_name like_num word_count comment_count  unix_time       happy
#> 1:       United      367         78             0 1501639364 0.009258472
#> 2:       United      364          7             0 1503852511 0.230786258
#> 3:       United     1295         39             0 1501563452 0.022499286
#> 4:       United     1852          2             0 1501650324 0.343278583
#> 5:       United      820          5             0 1501827795 0.158872347
#> 6:       United      176          4             2 1503897456 0.258525408
#>         angry    excited        sad       fear      bored
#> 1: 0.58718922 0.01269869 0.22660297 0.10853215 0.05571850
#> 2: 0.09788417 0.20977972 0.09496059 0.29169671 0.07489255
#> 3: 0.69481051 0.03101522 0.17317751 0.06765962 0.01083785
#> 4: 0.06833982 0.13003306 0.07020041 0.35412971 0.03401842
#> 5: 0.17569327 0.19441766 0.16607406 0.18623707 0.11870558
#> 6: 0.08925582 0.26439628 0.19558225 0.19224025 0.00000000

Data Rearrangement

Combine three views

company_tensor <- rbind(UA_comments, AA_comments, SW_comments)

Create an aggregate function

aggregation <- function(dat, interval, m){
  ## create a null data table with the same structure of dat
  result <- dat[word_count > max(word_count),]
  ## find all unique time points with at least one comment, and sort these time points
  time_points <- sort(unique(dat$unix_time))
  ## aggregate every m minutes
  for (time_point in seq(from = time_points[1]+interval, to = tail(time_points,1) - interval, by = 60*m)){
    temp = dat[unix_time < time_point + interval & unix_time >= time_point - interval,
               list(like_num = mean(like_num),
                 comment_count = mean(comment_count),
                 word_count = mean(word_count),
                 happy = mean(happy),
                 angry = mean(angry),
                 excited = mean(excited),
                 sad = mean(sad),
                 fear = mean(fear),
                 bored = mean(bored),
                 unix_time = time_point),
               by = c("company_name")]
    if (length(temp$company_name) == 3) {result = rbind(temp, result)}
    print(time_point)
  }
  return(result[order(unix_time)])
}

Aggregation

agg_result <- aggregation(company_tensor[,-c("comment_id","post_id")],1800*3, 15)

The following section is optional

We can add weight to comments with different like numbers or comment number

agg_result[,c(5:10)] <- agg_result[,c(5:10)] *sqrt(agg_result$like_num + agg_result$comment_count)
agg_result <- agg_result[,-c("like_num","comment_count","word_count")]

Making the time points uniformly distributed (optional)
To execute MultiSeqCCoA, we need each view with the same number of observations. The above aggregation algorithm can guarantee the three views with the same number of time points. However, these time points are not uniformly distributed. At a specific time point, if there are fewer than three companies with comments, this time point will be disregarded.

get_timepoints <- function(unix_time, step = 60*15){
  temp <- sort(unique(unix_time))
  return (seq(from = temp[1] + step, to = tail(temp,1)-step, by = step))
}

all_time_points <- get_timepoints(agg_result$unix_time)

all_time_points <- data.table(all_time_points)
setnames(all_time_points,"all_time_points","unix_time")

get full table

UA_table <- merge(all_time_points, agg_result[company_name == "United"],by = "unix_time", all.x = TRUE)
AA_table <- merge(all_time_points, agg_result[company_name == "AmericanAirlines"],by = "unix_time", all.x = TRUE)
SW_table <- merge(all_time_points, agg_result[company_name == "Southwest"],by = "unix_time", all.x = TRUE)

smoothing: Zeileis, A., & Grothendieck, G. (2005). zoo: S3 infrastructure for regular and irregular time series. arXiv preprint math/0505527.

my_fillna <- function(a){
  return(zoo::na.fill(a,"extend"))
}


UA_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
UA_table[,company_name:= "United"]

AA_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
AA_table[,company_name:= "AmericanAirlines"]

SW_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
SW_table[,company_name:= "Southwest"]

agg_result_uniform <- rbind(UA_table, AA_table, SW_table)

split to matrix

for (company in c("United","AmericanAirlines",
                  "Southwest")) {
  dat <- agg_result_uniform[company_name == company, -c("company_name","unix_time")]
  dat <- as.matrix(dat)
  dat <- scale(dat, center = FALSE)
  assign(paste(company,"matrix",sep = "_") ,dat)
}

Execute MultiSeqCCoA

Tuning Parameters

Bandwidth, Directions

# The aggregate unit is 15 minites, so the bandwidth defined as follows represents one week
bandwidth <- 24*7*4

# Set the number of directions (output of the dimension-reduction results) as 1
directions <- 1

MultiSeqCCoA

# The input is a list of matrics with the same number of rows (time points), the number of columns (variables) does not need to be the same
input <- list(United_matrix,AmericanAirlines_matrix,Southwest_matrix)


output <- MultiSeqCCA(input, K = directions, bw = bandwidth, control = list("tau" = 1e-3),initial = "PMA", maxitr = 1000, verbose = TRUE, ncore = 0)

Visualize the results


par(mar = c(1, 1, 0, 0))
for (i in c(1:3)) {
  plot.MultiSeqCCA(output, view = i, var_loc = c(1:6), tr_rank = 1,tr_weight = TRUE, type = "l", lty = 1:3, lwd = 3)
}
#> On view 1, trace rank 1, plotting variables: 
#> happy angry excited

#> On view 2, trace rank 1, plotting variables: 
#> happy angry excited

#> On view 3, trace rank 1, plotting variables: 
#> happy angry excited

A Tutorial to Multi-view Sequential Canonical Covariance Analysis

Hongfei Li

Last Updated: May 21, 2019

Import libraries

Read Data

Data structure

Data Rearrangement

Combine three views

Create an aggregate function

Aggregation

The following section is optional

split to matrix

Execute MultiSeqCCoA

Tuning Parameters

MultiSeqCCoA

Visualize the results