To analyze news articles on Facebook, we need a way to extract the necessary information on posts, comments, comment replies, and the reactions to each of these for a given Facebook page. We use calls to the Facebook Graph API to retrieve this information.
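Concretely, the extraction boils down to GET requests against graph.facebook.com. Here is a minimal sketch of fetching a page's posts; the API version, field list, and page name are illustrative assumptions, and the actual extraction is handled by the helper functions sourced below:
library(httr)
library(jsonlite)
# Illustrative only: fetch the five most recent posts of a page
resp <- GET(
  "https://graph.facebook.com/v2.8/rapplerdotcom/posts",
  query = list(
    fields = "id,message,created_time",
    limit = 5,
    access_token = fb.tkn  # app access token, obtained further below
  )
)
posts <- fromJSON(content(resp, as = "text", encoding = "UTF-8"))$data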
We load the libraries necessary to perform the extraction:
suppressPackageStartupMessages({
  library(httr)        # Making API calls
  library(jsonlite)    # JSON parsing
  library(rvest)       # Page scraping
  library(magrittr)    # Advanced piping
  library(purrr)       # Functional programming
  library(stringr)     # String manipulation
  library(dplyr)       # Data frame manipulation
  library(tibble)      # Better data frames
  library(tidyr)       # Frame shaping
  library(lubridate)   # Timestamp manipulation
  library(htmlwidgets) # JavaScript bindings
  library(DT)          # DataTables JS library
  library(pushoverr)   # Push notifications
})
# Set proper working directory
if(!str_detect(getwd(), "src")) setwd("src")
knitr::opts_chunk$set(eval = FALSE)
We set up some scraping functions to help us extract information from a Facebook page.
source("../func/01-page-scraping-functions.R")
To prevent bias in selecting the pages to be scraped, we rely on Socialbakers' Ranking of Top 100 Media Facebook Pages in the Philippines by Total Likes.
map_df(
  c(
    "../dta/01-top-ph-media-fbpages-50.html",
    "../dta/02-top-ph-media-fbpages-100.html"
  ),
  function(html) {
    # Parse the ranking table out of the saved Socialbakers HTML
    read_html(x = html, encoding = "UTF-8") %>%
      html_node(".brand-table-list") %>%
      html_table(fill = TRUE) %>%
      slice(1:50) %>%
      select(
        rank = X1,
        page_title = X2,
        page_likes = X3
      ) %>%
      # Strip markup residue from the title and fan-count columns
      mutate(
        page_title = page_title %>%
          str_replace_all("^.*\n|\t", ""),
        page_likes = page_likes %>%
          str_replace_all("Total Fans|\n|\t|\\s", "") %>%
          as.integer()
      ) %>%
      # Pull the numeric page id out of each profile link
      mutate(
        page_id =
          read_html(x = html, encoding = "UTF-8") %>%
          html_nodes(".name .acc-placeholder-img") %>%
          map_chr(html_attr, "href") %>%
          str_replace_all("^.*detail/", "") %>%
          str_extract("^[0-9]*-") %>%
          str_replace_all("-", "")
      ) %>%
      # Resolve each page id to its current page name by following
      # the redirect from facebook.com/<page id>
      mutate(
        page_name =
          map_chr(
            page_id,
            function(page_id) {
              message(page_id)
              GET(
                paste0("https://www.facebook.com/", page_id),
                user_agent("Mozilla/5.0")
              ) %$% url %>%
                str_extract("facebook.com/.*/$") %>%
                str_replace_all("facebook.com/|/", "")
            }
          )
      )
  }
) -> top_100_fbpages.dt
# Manual correction for page that changed name
top_100_fbpages.dt$page_name[
  top_100_fbpages.dt$page_name == "News5Features"
] <- "news5aksyon"
# Cache out
saveRDS(top_100_fbpages.dt, "../dta/04-news-pages.rds")
datatable(top_100_fbpages.dt)
We want to isolate the pages that are classified as news sites, which unfortunately can only be done manually.
newspages.ls <-
  top_100_fbpages.dt$page_name[
    c(2, 4, 15, 17, 19, 22, 24, 25, 28, 30, 33, 36,
      39, 40, 44, 47, 50, 55, 69, 73, 81, 87)]
print(newspages.ls)
To start collecting information, we authenticate with a Facebook app access token. Since page posts are public, we do not need a user access token to gather the posts and other information.
# Load authentication files
load("../bin/fb_auth.rda")
# Get authentication
getAppAccessToken(
  fb_app_id = fb_app_id,
  fb_app_secret = fb_app_secret
)
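getAppAccessToken() is one of the sourced helpers. As a hypothetical sketch of what it amounts to: Facebook accepts app access tokens of the form "<app_id>|<app_secret>", so the helper presumably just assembles one and stores it for later use (the function body below is our assumption, not the sourced code):
# Hypothetical sketch of the helper; fb.tkn is the token used below
getAppAccessToken <- function(fb_app_id, fb_app_secret) {
  fb.tkn <<- paste0(fb_app_id, "|", fb_app_secret)
  invisible(fb.tkn)
}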
We then gather all the data.
# Download data for each news page, skipping pages already cached
try({
  walk(
    newspages.ls,
    function(newspage) {
      # Only run if neither an in-progress nor a completed cache file exists
      if (
        !file.exists(paste0("../dta/03-fbpage-", newspage, ".rds")) &&
        !file.exists(paste0("../dta/03-fbpageDONE-", newspage, ".rds"))
      ) {
        posts.ls <-
          cachedGetFBPage(
            page_name = newspage,
            limit_timestamp = "2016-01-01 00:00:00",
            timezone = "Asia/Manila",
            cache_interval = "1 day",
            posts_per_page = 5,
            num_comments = 100,
            num_attachments = 100,
            num_reactions = 2000,
            num_comments_likes = 2000,
            num_comments_comments = 25,
            num_comments_comments_likes = 2000,
            access_token = fb.tkn
          )
        saveRDS(
          object = posts.ls,
          file = paste0("../dta/03-fbpage-", newspage, ".rds")
        )
        rm(posts.ls)
      }
    }
  )
})
pushNotification()
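pushNotification() is a small project helper around pushoverr. A plausible sketch, assuming the Pushover user and app keys are configured elsewhere (e.g. via pushoverr's set_pushover_user() and set_pushover_app()):
# Hypothetical wrapper: notify when a long-running job finishes
pushNotification <- function(message = "Facebook scraping complete") {
  pushoverr::pushover(message = message)
}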
Now that we have collected the information, we create a SQLite database so we can process it without loading everything into memory.
# Create database
fbpages.sqlite <-
  src_sqlite(
    path = "../dta/03-fbpages.sqlite",
    create = TRUE
  )
fbpages.sqlite.con <- fbpages.sqlite$con
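Since the SQLite backend sits on top of DBI, we can sanity-check the fresh connection directly (a quick sketch):
# The newly created database should have no tables yet
DBI::dbListTables(fbpages.sqlite.con)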
We then parse each of the cached data files and load their contents into the database.
# Get all cache files
fbpage_data.ls <-
  list.files("../dta", full.names = TRUE) %>% {
    .[str_detect(., "/03-fbpage-.*\\.rds")]
  }
# Walk through each cache file
walk(
  fbpage_data.ls,
  function(data_path) {
    # Read in data
    data <- readRDS(data_path)
    # Combine data frames in each shard
    walk(
      seq_along(data),
      function(i) {
        # Load shard
        message("Shard ", i, ": ", appendLF = FALSE)
        shard <- data[[i]]
        # Walk through each element of the shard
        walk(
          c(
            "posts", "posts_attachments", "posts_reactions",
            "posts_comments", "posts_comments_likes",
            "posts_comments_comments",
            "posts_comments_comments_likes"
          ),
          function(name) {
            # If the element exists, append it to the matching fact table
            if (is.data.frame(shard[[name]])) {
              message(name, " ", appendLF = FALSE)
              fbpages.sqlite.con %>%
                db_insert_into(
                  table = paste0("fact_", name),
                  values = shard[[name]],
                  append = TRUE
                )
            }
          }
        )
        # Remove shard from memory
        rm(shard)
        message("Done!")
      }
    )
    # Remove data from memory
    rm(data)
    message("Done processing ", data_path, "!")
  }
)
pushNotification("SQL storage complete")
We clean the data by:
- removing duplicate rows from each fact table and adding a unique index on its natural key,
- factoring users out into a dim_user table and a dim_user_unique table, and
- running ANALYZE so SQLite refreshes its query planner statistics.
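The cleanup statements below are issued against the SQLite database. From R, each can be run over the same connection with DBI (a sketch, assuming a reasonably recent DBI; shown here for the fact_posts pair):
# Run one deduplicate-and-index pair atomically from R
DBI::dbWithTransaction(fbpages.sqlite.con, {
  DBI::dbExecute(fbpages.sqlite.con, "
    DELETE FROM fact_posts
    WHERE rowid NOT IN (
      SELECT MIN(rowid) FROM fact_posts
      GROUP BY post_id)")
  DBI::dbExecute(fbpages.sqlite.con,
    "CREATE UNIQUE INDEX post_id ON fact_posts (post_id)")
})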
DELETE FROM fact_posts
WHERE rowid NOT IN (
  SELECT MIN(rowid) FROM fact_posts
  GROUP BY post_id);
CREATE UNIQUE INDEX post_id ON fact_posts (post_id);

DELETE FROM fact_posts_attachments
WHERE rowid NOT IN (
  SELECT MIN(rowid) FROM fact_posts_attachments
  GROUP BY object_id, attachment_target_url);
CREATE UNIQUE INDEX attachment_id ON
  fact_posts_attachments (object_id, attachment_target_url);

DELETE FROM fact_posts_comments
WHERE rowid NOT IN (
  SELECT MIN(rowid) FROM fact_posts_comments
  GROUP BY comment_id);
CREATE UNIQUE INDEX comment_id ON fact_posts_comments (comment_id);

DELETE FROM fact_posts_comments_comments
WHERE rowid NOT IN (
  SELECT MIN(rowid) FROM fact_posts_comments_comments
  GROUP BY comment_id);
CREATE UNIQUE INDEX comment_comment_id ON
  fact_posts_comments_comments (comment_id);

DELETE FROM fact_posts_reactions
WHERE rowid NOT IN (
  SELECT MIN(rowid) FROM fact_posts_reactions
  GROUP BY object_id, reactor_id);
CREATE UNIQUE INDEX reaction_id ON
  fact_posts_reactions (object_id, reactor_id);

DELETE FROM fact_posts_comments_likes
WHERE rowid NOT IN (
  SELECT MIN(rowid) FROM fact_posts_comments_likes
  GROUP BY object_id, liker_id);
CREATE UNIQUE INDEX comment_like_id ON
  fact_posts_comments_likes (object_id, liker_id);

DELETE FROM fact_posts_comments_comments_likes
WHERE rowid NOT IN (
  SELECT MIN(rowid) FROM fact_posts_comments_comments_likes
  GROUP BY object_id, liker_id);
CREATE UNIQUE INDEX comment_comment_like_id ON
  fact_posts_comments_comments_likes (object_id, liker_id);
We factor users out of the tables that carry user information, to save storage space.
CREATE TABLE dim_user AS
WITH users_aggregated AS (
  SELECT liker_id AS user_id,
         liker_name AS user_name
  FROM fact_posts_comments_comments_likes
  UNION
  SELECT liker_id AS user_id,
         liker_name AS user_name
  FROM fact_posts_comments_likes
  UNION
  SELECT reactor_id AS user_id,
         reactor_name AS user_name
  FROM fact_posts_reactions
  UNION
  SELECT commenter_id AS user_id,
         commenter_name AS user_name
  FROM fact_posts_comments_comments
  UNION
  SELECT commenter_id AS user_id,
         commenter_name AS user_name
  FROM fact_posts_comments
)
SELECT DISTINCT
  user_id,
  user_name
FROM users_aggregated;
SELECT COUNT(DISTINCT user_id) AS num_users FROM dim_user;
CREATE TABLE dim_user_unique AS
SELECT *
FROM dim_user
WHERE rowid IN (
  SELECT MAX(rowid) FROM dim_user
  GROUP BY user_id);
CREATE INDEX user_id ON dim_user (user_id);
CREATE UNIQUE INDEX user_unique_id ON dim_user_unique (user_id);
ANALYZE;
After extracting all Facebook posts, reactions, and comments made on these news pages in 2016, we arrive at the following table schema:
dim_user
- contains all users and their names; duplicate rows are for users that changed their names.
dim_user_unique
- contains all users and the latest name they have used; one row per user.
fact_posts
- contains information about all posts made by the news pages, excluding posts by others.
fact_posts_comments
- contains all top-level comments made on posts.
fact_posts_reactions
- contains all reactions to the posts.
fact_posts_comments_likes
- contains all likes on top-level comments.
fact_posts_comments_comments
- contains all comment replies (comments on comments).
fact_posts_comments_comments_likes
- contains all likes on comment replies.
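As a quick illustration of how this schema can be queried with dplyr's lazy SQL translation, a minimal sketch that ranks commenters by activity and attaches their latest names (table and column names as defined above):
# Most active commenters across all news pages, with latest names
tbl(fbpages.sqlite, "fact_posts_comments") %>%
  count(commenter_id, sort = TRUE) %>%
  left_join(
    tbl(fbpages.sqlite, "dim_user_unique"),
    by = c("commenter_id" = "user_id")
  ) %>%
  head(10) %>%
  collect()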