A Tutorial to Multi-view Sequential Canonical Covariance Analysis

Hongfei Li

Last Updated: May 21, 2019

This tutorial uses an empirical example to describe what we can achieve from MultiSeqCCoA. The package was written by Ruoqing Zhu. The reference of the paper that put forward this package is recommended as

“Cao X, Folta T, Li H, Zhu R (2019) A New Method to Understand the Online Word of Mouth Dynamics: Multi-View Sequential Canonical Covariance Analysis”

For all the authors’ information, please see: Xian Cao Timothy Folta Hongfei Li Ruoqing Zhu

This paper is currently under review. For a detailed introduction to MultiSeqCCoA, please refer to MultiSeqCCoA.pptx.

Import libraries

library(MultiSeqCCA)
library(data.table)
library(anytime)

Read Data

The sample dataset is available: UA_comments.csv, AA_comments.csv, SW_comments.csv.

UA_comments <- fread("UA_comments.csv")
AA_comments <- fread("AA_comments.csv")
SW_comments <- fread("SW_comments.csv")

Data structure

Column Names

colnames(UA_comments)
#>  [1] "comment_id"    "post_id"       "company_name"  "like_num"     
#>  [5] "word_count"    "comment_count" "unix_time"     "happy"        
#>  [9] "angry"         "excited"       "sad"           "fear"         
#> [13] "bored"

Variable Descriptions

var_des <- fread("var_description.csv")
knitr::kable(var_des)
comment_id Comment ID
post_id Post ID of this comment
company_name Name of the airline company
like_num Number of like’s of this comment
word_count Word count of this comment
comment_count Comment count of this comment
unix_time Unix time
happy Happy score
angry Angry score
excited Excited score
sad Sad score
fear Fear score
bored Bored score

Data structure

head(UA_comments)
#>                             comment_id                           post_id
#> 1: 10155190762835781_10155334944980781 199504650087085_10155190762835781
#> 2:   1070677129636495_1466023640101840  199504650087085_1070677129636495
#> 3:   1273344189369787_1442256722478532  199504650087085_1273344189369787
#> 4:   1286880621349477_1443113662392838  199504650087085_1286880621349477
#> 5:   1309018619135677_1444949168875954  199504650087085_1309018619135677
#> 6:    1315890551781817_166234563942792  199504650087085_1315890551781817
#>    company_name like_num word_count comment_count  unix_time       happy
#> 1:       United      367         78             0 1501639364 0.009258472
#> 2:       United      364          7             0 1503852511 0.230786258
#> 3:       United     1295         39             0 1501563452 0.022499286
#> 4:       United     1852          2             0 1501650324 0.343278583
#> 5:       United      820          5             0 1501827795 0.158872347
#> 6:       United      176          4             2 1503897456 0.258525408
#>         angry    excited        sad       fear      bored
#> 1: 0.58718922 0.01269869 0.22660297 0.10853215 0.05571850
#> 2: 0.09788417 0.20977972 0.09496059 0.29169671 0.07489255
#> 3: 0.69481051 0.03101522 0.17317751 0.06765962 0.01083785
#> 4: 0.06833982 0.13003306 0.07020041 0.35412971 0.03401842
#> 5: 0.17569327 0.19441766 0.16607406 0.18623707 0.11870558
#> 6: 0.08925582 0.26439628 0.19558225 0.19224025 0.00000000

Data Rearrangement

Combine three views

company_tensor <- rbind(UA_comments, AA_comments, SW_comments)

Create an aggregate function

aggregation <- function(dat, interval, m){
  ## create a null data table with the same structure of dat
  result <- dat[word_count > max(word_count),]
  ## find all unique time points with at least one comment, and sort these time points
  time_points <- sort(unique(dat$unix_time))
  ## aggregate every m minutes
  for (time_point in seq(from = time_points[1]+interval, to = tail(time_points,1) - interval, by = 60*m)){
    temp = dat[unix_time < time_point + interval & unix_time >= time_point - interval,
               list(like_num = mean(like_num),
                 comment_count = mean(comment_count),
                 word_count = mean(word_count),
                 happy = mean(happy),
                 angry = mean(angry),
                 excited = mean(excited),
                 sad = mean(sad),
                 fear = mean(fear),
                 bored = mean(bored),
                 unix_time = time_point),
               by = c("company_name")]
    if (length(temp$company_name) == 3) {result = rbind(temp, result)}
    print(time_point)
  }
  return(result[order(unix_time)])
}

Aggregation

agg_result <- aggregation(company_tensor[,-c("comment_id","post_id")],1800*3, 15)

The following section is optional

  • We can add weight to comments with different like numbers or comment number
agg_result[,c(5:10)] <- agg_result[,c(5:10)] *sqrt(agg_result$like_num + agg_result$comment_count)
agg_result <- agg_result[,-c("like_num","comment_count","word_count")]
  • Making the time points uniformly distributed (optional)

  • To execute MultiSeqCCoA, we need each view with the same number of observations. The above aggregation algorithm can guarantee the three views with the same number of time points. However, these time points are not uniformly distributed. At a specific time point, if there are fewer than three companies with comments, this time point will be disregarded.

get_timepoints <- function(unix_time, step = 60*15){
  temp <- sort(unique(unix_time))
  return (seq(from = temp[1] + step, to = tail(temp,1)-step, by = step))
}

all_time_points <- get_timepoints(agg_result$unix_time)

all_time_points <- data.table(all_time_points)
setnames(all_time_points,"all_time_points","unix_time")
  • get full table
UA_table <- merge(all_time_points, agg_result[company_name == "United"],by = "unix_time", all.x = TRUE)
AA_table <- merge(all_time_points, agg_result[company_name == "AmericanAirlines"],by = "unix_time", all.x = TRUE)
SW_table <- merge(all_time_points, agg_result[company_name == "Southwest"],by = "unix_time", all.x = TRUE)
  • smoothing: Zeileis, A., & Grothendieck, G. (2005). zoo: S3 infrastructure for regular and irregular time series. arXiv preprint math/0505527.
my_fillna <- function(a){
  return(zoo::na.fill(a,"extend"))
}


UA_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
UA_table[,company_name:= "United"]

AA_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
AA_table[,company_name:= "AmericanAirlines"]

SW_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
SW_table[,company_name:= "Southwest"]

agg_result_uniform <- rbind(UA_table, AA_table, SW_table)

split to matrix

for (company in c("United","AmericanAirlines",
                  "Southwest")) {
  dat <- agg_result_uniform[company_name == company, -c("company_name","unix_time")]
  dat <- as.matrix(dat)
  dat <- scale(dat, center = FALSE)
  assign(paste(company,"matrix",sep = "_") ,dat)
}

Execute MultiSeqCCoA

Tuning Parameters

# The aggregate unit is 15 minites, so the bandwidth defined as follows represents one week
bandwidth <- 24*7*4

# Set the number of directions (output of the dimension-reduction results) as 1
directions <- 1

MultiSeqCCoA

# The input is a list of matrics with the same number of rows (time points), the number of columns (variables) does not need to be the same
input <- list(United_matrix,AmericanAirlines_matrix,Southwest_matrix)


output <- MultiSeqCCA(input, K = directions, bw = bandwidth, control = list("tau" = 1e-3),initial = "PMA", maxitr = 1000, verbose = TRUE, ncore = 0)

Visualize the results


par(mar = c(1, 1, 0, 0))
for (i in c(1:3)) {
  plot.MultiSeqCCA(output, view = i, var_loc = c(1:6), tr_rank = 1,tr_weight = TRUE, type = "l", lty = 1:3, lwd = 3)
}
#> On view 1, trace rank 1, plotting variables: 
#> happy angry excited

#> On view 2, trace rank 1, plotting variables: 
#> happy angry excited

#> On view 3, trace rank 1, plotting variables: 
#> happy angry excited