This tutorial uses an empirical example to describe what we can achieve from MultiSeqCCoA. The package was written by Ruoqing Zhu. The reference of the paper that put forward this package is recommended as
“Cao X, Folta T, Li H, Zhu R (2019) A New Method to Understand the Online Word of Mouth Dynamics: Multi-View Sequential Canonical Covariance Analysis”
For all the authors’ information, please see: Xian Cao Timothy Folta Hongfei Li Ruoqing Zhu
This paper is currently under review. For a detailed introduction to MultiSeqCCoA, please refer to MultiSeqCCoA.pptx.
library(MultiSeqCCA)
library(data.table)
library(anytime)
The sample dataset is available: UA_comments.csv, AA_comments.csv, SW_comments.csv.
UA_comments <- fread("UA_comments.csv")
AA_comments <- fread("AA_comments.csv")
SW_comments <- fread("SW_comments.csv")
Column Names
colnames(UA_comments)
#> [1] "comment_id" "post_id" "company_name" "like_num"
#> [5] "word_count" "comment_count" "unix_time" "happy"
#> [9] "angry" "excited" "sad" "fear"
#> [13] "bored"
Variable Descriptions
var_des <- fread("var_description.csv")
knitr::kable(var_des)
comment_id | Comment ID |
---|---|
post_id | Post ID of this comment |
company_name | Name of the airline company |
like_num | Number of like’s of this comment |
word_count | Word count of this comment |
comment_count | Comment count of this comment |
unix_time | Unix time |
happy | Happy score |
angry | Angry score |
excited | Excited score |
sad | Sad score |
fear | Fear score |
bored | Bored score |
Data structure
head(UA_comments)
#> comment_id post_id
#> 1: 10155190762835781_10155334944980781 199504650087085_10155190762835781
#> 2: 1070677129636495_1466023640101840 199504650087085_1070677129636495
#> 3: 1273344189369787_1442256722478532 199504650087085_1273344189369787
#> 4: 1286880621349477_1443113662392838 199504650087085_1286880621349477
#> 5: 1309018619135677_1444949168875954 199504650087085_1309018619135677
#> 6: 1315890551781817_166234563942792 199504650087085_1315890551781817
#> company_name like_num word_count comment_count unix_time happy
#> 1: United 367 78 0 1501639364 0.009258472
#> 2: United 364 7 0 1503852511 0.230786258
#> 3: United 1295 39 0 1501563452 0.022499286
#> 4: United 1852 2 0 1501650324 0.343278583
#> 5: United 820 5 0 1501827795 0.158872347
#> 6: United 176 4 2 1503897456 0.258525408
#> angry excited sad fear bored
#> 1: 0.58718922 0.01269869 0.22660297 0.10853215 0.05571850
#> 2: 0.09788417 0.20977972 0.09496059 0.29169671 0.07489255
#> 3: 0.69481051 0.03101522 0.17317751 0.06765962 0.01083785
#> 4: 0.06833982 0.13003306 0.07020041 0.35412971 0.03401842
#> 5: 0.17569327 0.19441766 0.16607406 0.18623707 0.11870558
#> 6: 0.08925582 0.26439628 0.19558225 0.19224025 0.00000000
company_tensor <- rbind(UA_comments, AA_comments, SW_comments)
aggregation <- function(dat, interval, m){
## create a null data table with the same structure of dat
result <- dat[word_count > max(word_count),]
## find all unique time points with at least one comment, and sort these time points
time_points <- sort(unique(dat$unix_time))
## aggregate every m minutes
for (time_point in seq(from = time_points[1]+interval, to = tail(time_points,1) - interval, by = 60*m)){
temp = dat[unix_time < time_point + interval & unix_time >= time_point - interval,
list(like_num = mean(like_num),
comment_count = mean(comment_count),
word_count = mean(word_count),
happy = mean(happy),
angry = mean(angry),
excited = mean(excited),
sad = mean(sad),
fear = mean(fear),
bored = mean(bored),
unix_time = time_point),
by = c("company_name")]
if (length(temp$company_name) == 3) {result = rbind(temp, result)}
print(time_point)
}
return(result[order(unix_time)])
}
agg_result <- aggregation(company_tensor[,-c("comment_id","post_id")],1800*3, 15)
agg_result[,c(5:10)] <- agg_result[,c(5:10)] *sqrt(agg_result$like_num + agg_result$comment_count)
agg_result <- agg_result[,-c("like_num","comment_count","word_count")]
Making the time points uniformly distributed (optional)
To execute MultiSeqCCoA, we need each view with the same number of observations. The above aggregation algorithm can guarantee the three views with the same number of time points. However, these time points are not uniformly distributed. At a specific time point, if there are fewer than three companies with comments, this time point will be disregarded.
get_timepoints <- function(unix_time, step = 60*15){
temp <- sort(unique(unix_time))
return (seq(from = temp[1] + step, to = tail(temp,1)-step, by = step))
}
all_time_points <- get_timepoints(agg_result$unix_time)
all_time_points <- data.table(all_time_points)
setnames(all_time_points,"all_time_points","unix_time")
UA_table <- merge(all_time_points, agg_result[company_name == "United"],by = "unix_time", all.x = TRUE)
AA_table <- merge(all_time_points, agg_result[company_name == "AmericanAirlines"],by = "unix_time", all.x = TRUE)
SW_table <- merge(all_time_points, agg_result[company_name == "Southwest"],by = "unix_time", all.x = TRUE)
my_fillna <- function(a){
return(zoo::na.fill(a,"extend"))
}
UA_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
UA_table[,company_name:= "United"]
AA_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
AA_table[,company_name:= "AmericanAirlines"]
SW_table[, c(3:8) := lapply(.SD, my_fillna), .SDcols = c(3:8)]
SW_table[,company_name:= "Southwest"]
agg_result_uniform <- rbind(UA_table, AA_table, SW_table)
for (company in c("United","AmericanAirlines",
"Southwest")) {
dat <- agg_result_uniform[company_name == company, -c("company_name","unix_time")]
dat <- as.matrix(dat)
dat <- scale(dat, center = FALSE)
assign(paste(company,"matrix",sep = "_") ,dat)
}
# The aggregate unit is 15 minites, so the bandwidth defined as follows represents one week
bandwidth <- 24*7*4
# Set the number of directions (output of the dimension-reduction results) as 1
directions <- 1
# The input is a list of matrics with the same number of rows (time points), the number of columns (variables) does not need to be the same
input <- list(United_matrix,AmericanAirlines_matrix,Southwest_matrix)
output <- MultiSeqCCA(input, K = directions, bw = bandwidth, control = list("tau" = 1e-3),initial = "PMA", maxitr = 1000, verbose = TRUE, ncore = 0)
par(mar = c(1, 1, 0, 0))
for (i in c(1:3)) {
plot.MultiSeqCCA(output, view = i, var_loc = c(1:6), tr_rank = 1,tr_weight = TRUE, type = "l", lty = 1:3, lwd = 3)
}
#> On view 1, trace rank 1, plotting variables:
#> happy angry excited
#> On view 2, trace rank 1, plotting variables:
#> happy angry excited
#> On view 3, trace rank 1, plotting variables:
#> happy angry excited