Background

To analyze news articles on Facebook, we need a method to extract the necessary information on posts, comments, comment replies, and the reactions to each of these for a specific Facebook page. We use calls to the Facebook Graph API to retrieve this information.
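
For orientation, the sketch below shows roughly what such a call looks like, assuming Graph API v2.8 (current at the time of writing); the page ID, field list, and token are placeholders, not values from this project.

library(httr)
library(jsonlite)

# Hypothetical request for a page's recent posts; PAGE_ID and the
# token are placeholders, and the field list is an assumption.
resp <- GET(
  "https://graph.facebook.com/v2.8/PAGE_ID/posts",
  query = list(
    fields       = "id,message,created_time",
    limit        = 25,
    access_token = "APP_ID|APP_SECRET"
  )
)

posts <- fromJSON(content(resp, as = "text", encoding = "UTF-8"))
str(posts$data)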

Preliminaries

Libraries

We load the libraries necessary to perform the extraction:

suppressPackageStartupMessages({
  library(httr)      # Making API calls
  library(jsonlite)  # JSON parsing
  library(rvest)     # Page scraping
  
  library(magrittr)  # Advanced piping
  library(purrr)     # Functional programming
  
  library(stringr)   # String manipulation
  library(dplyr)     # Data frame manipulation
  library(tibble)    # Better data frames
  library(tidyr)     # Frame shaping
  library(lubridate) # Timestamp manipulation
  
  library(htmlwidgets) # JavaScript bindings
  library(DT)          # DataTables JS Library
  
  library(pushoverr)   # Push notifications
})

# Set proper working directory
if(!str_detect(getwd(), "src")) setwd("src")

knitr::opts_chunk$set(eval = FALSE)

Scraping

Functions

We set up some scraping functions to help us extract information from a Facebook Page.

source("../func/01-page-scraping-functions.R")

Execution

Sample Selection

To prevent bias in selecting the pages to be scraped, we rely on Socialbakers' ranking of the Top 100 Media Facebook Pages in the Philippines by Total Likes.

map_df(
  c(
    "../dta/01-top-ph-media-fbpages-50.html",
    "../dta/02-top-ph-media-fbpages-100.html"
  ),
  function(html) {
    # Parse the ranking table out of the cached Socialbakers HTML
    read_html(x = html, encoding = "UTF-8") %>% 
      html_node(".brand-table-list") %>% 
      html_table(fill = TRUE) %>% 
      slice(1:50) %>% 
      select(
        rank       = X1, 
        page_title = X2,
        page_likes = X3
      ) %>% 
      # Clean up the page titles and convert fan counts to integers
      mutate(
        page_title = page_title %>% 
          str_replace_all("^.*\n|\t", ""),
        page_likes = page_likes %>% 
          str_replace_all("Total Fans|\n|\t|\\s", "") %>% 
          as.integer()
      ) %>% 
      # Recover the numeric page ID from each profile detail link
      mutate(
        page_id = 
          read_html(x = html, encoding = "UTF-8") %>% 
          html_nodes(".name .acc-placeholder-img") %>% 
          map_chr(html_attr, "href") %>% 
          str_replace_all("^.*detail/", "") %>% 
          str_extract("^[0-9]*-") %>% 
          str_replace_all("-", "")
      ) %>% 
      # Resolve each page ID to its canonical page name by following
      # the redirect on facebook.com
      mutate(
        page_name = 
          map_chr(
            page_id,
            function(page_id) {
              message(page_id)
              GET(
                paste0("https://www.facebook.com/", page_id),
                user_agent("Mozilla/5.0")
              ) %$% url %>% 
                str_extract("facebook.com/.*/$") %>% 
                str_replace_all("facebook.com/|/", "")
            }
          )
      )
  }
) -> top_100_fbpages.dt

# Manual correction for page that changed name
top_100_fbpages.dt$page_name[
  top_100_fbpages.dt$page_name == "News5Features"
] <- "news5aksyon"

# Cache out
saveRDS(top_100_fbpages.dt, "../dta/04-news-pages.rds")
datatable(top_100_fbpages.dt)

We want to isolate the pages that are classified as news sites, which unfortunately can only be done manually.

newspages.ls <-
  top_100_fbpages.dt$page_name[
    c(2, 4, 15, 17, 19, 22, 24, 25, 28, 30, 33, 36,
      39, 40, 44, 47, 50, 55, 69, 73, 81, 87)]

print(newspages.ls)

Data Extraction

To start collecting information, we authenticate with a Facebook app access token. Since page posts are public, we do not need a user access token to gather the posts and other information.

# Load authentication files
load("../bin/fb_auth.rda")

# Get authentication
getAppAccessToken(
  fb_app_id = fb_app_id,
  fb_app_secret = fb_app_secret
)
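
getAppAccessToken() is defined in the sourced functions file; presumably it wraps the Graph API's standard client-credentials exchange, roughly as sketched below (an assumption, not the actual implementation), yielding the fb.tkn used in the next step.

# Sketch of a client-credentials token request; the real
# getAppAccessToken() lives in the sourced functions file.
resp <- GET(
  "https://graph.facebook.com/oauth/access_token",
  query = list(
    client_id     = fb_app_id,
    client_secret = fb_app_secret,
    grant_type    = "client_credentials"
  )
)
fb.tkn <- content(resp)$access_token  # assumes a JSON response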

We then gather all the data.

# Download data for any pages not yet cached

try({
  walk(
    newspages.ls,
    function(newspage) {
      # only run if file does not exist
      if (
        !file.exists(paste0("../dta/03-fbpage-", newspage, ".rds")) &
        !file.exists(paste0("../dta/03-fbpageDONE-", newspage, ".rds"))
      ) {
        posts.ls <-
          cachedGetFBPage(
            page_name                   = newspage,
            limit_timestamp             = "2016-01-01 00:00:00",
            timezone                    = "Asia/Manila",
            cache_interval              = "1 day",
            posts_per_page              = 5,
            num_comments                = 100, 
            num_attachments             = 100,
            num_reactions               = 2000,  
            num_comments_likes          = 2000, 
            num_comments_comments       = 25,
            num_comments_comments_likes = 2000,
            access_token                = fb.tkn
          )
        
        saveRDS(
          object = posts.ls, 
          file   = paste0("../dta/03-fbpage-", newspage, ".rds")
        )
        
        rm(posts.ls)
      }
    }
  )
})

pushNotification()
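
pushNotification() is not a pushoverr export; it is presumably a thin wrapper around pushoverr::pushover(), along these lines (a sketch, assuming the Pushover user key and app token are already configured for pushoverr):

# Hypothetical wrapper; assumes Pushover credentials are available
# to pushoverr (e.g. via its configuration helpers).
pushNotification <- function(message = "Extraction complete") {
  pushover(message = message)
}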

Storage

Now that we have collected the information, we create a SQLite database so we can process the data without holding it all in memory.

# Create database
fbpages.sqlite <- 
  src_sqlite(
    path = "../dta/03-fbpages.sqlite",
    create = TRUE
  )

fbpages.sqlite.con <- fbpages.sqlite$con
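
Note that src_sqlite() is a thin wrapper over RSQLite, so the extracted $con handle is an ordinary DBI connection and standard DBI calls work against it, for example:

# The $con slot is a plain DBI connection; e.g. list the tables
# once data has been inserted.
DBI::dbListTables(fbpages.sqlite.con)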

We then walk through each of the cached data files and insert their contents into the SQLite database, shard by shard.

# Get all cache files
fbpage_data.ls <-
  list.files("../dta", full.names = TRUE) %>% {
    .[str_detect(., "/03-fbpage-.*\\.rds")]
  }

# Walk through each cache file
walk(
  fbpage_data.ls,
  function(data_path) {
    # Read in data
    data <- readRDS(data_path)
    
    # Combine data frames in each shard
    walk(
      seq_along(data),
      function(i) {
        # Load shard
        message("Shard ", i, ": ", appendLF = FALSE)
        shard <- data[[i]]
        
        # Walk through each element of shard
        walk(
          c(
            "posts", "posts_attachments", "posts_reactions",
            "posts_comments", "posts_comments_likes", 
            "posts_comments_comments",
            "posts_comments_comments_likes"
          ),
          function(name) {
            # If element exists, write into appropriate data frame
            if (is.data.frame(shard[[name]])) {
              message(name, " ", appendLF = FALSE)
              fbpages.sqlite.con %>% 
                db_insert_into(
                  table  = paste0("fact_", name),
                  values = shard[[name]],
                  append = TRUE
                )
            }
          }
        )
        
        # Remove shard from memory
        rm(shard)
        message("Done!")
      }
    )

    # Remove data from memory
    rm(data)
    message("Done processing ", data_path, "!")
  }
)

pushNotification("SQL storage complete")

Data Cleaning

We clean the data by:

  • Removing duplicate posts introduced by pagination
  • Factoring out the users who commented, liked, or reacted into a dim_user table and a dim_user_unique table
  • Optimizing the SQLite database by adding indices and running ANALYZE (see the execution sketch after this list)
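
The statements in the subsections below were run against the SQLite database; from R, each can be executed through the existing connection with DBI, for example with the post-deduplication statement (a sketch using the fbpages.sqlite.con handle created above):

# Run a cleanup statement through the existing connection;
# dbExecute() returns the number of rows affected.
DBI::dbExecute(
  fbpages.sqlite.con,
  "DELETE FROM fact_posts
   WHERE  rowid NOT IN (
          SELECT MIN(rowid) FROM fact_posts
          GROUP BY post_id)"
)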

Posts

DELETE 
FROM    fact_posts 
WHERE   rowid NOT IN (
        SELECT MIN(rowid) FROM fact_posts 
        GROUP BY post_id);

CREATE UNIQUE INDEX post_id ON fact_posts (post_id)

Post Attachments

DELETE
FROM    fact_posts_attachments
WHERE   rowid NOT IN (
        SELECT MIN(rowid) FROM fact_posts_attachments 
        GROUP BY object_id, attachment_target_url);

CREATE UNIQUE INDEX attachment_id ON 
  fact_posts_attachments (object_id, attachment_target_url)

Post Comments

DELETE
FROM    fact_posts_comments
WHERE   rowid NOT IN (
        SELECT MIN(rowid) FROM fact_posts_comments 
        GROUP BY comment_id);

CREATE UNIQUE INDEX comment_id ON fact_posts_comments (comment_id)

Post Comment Replies

DELETE
FROM    fact_posts_comments_comments
WHERE   rowid NOT IN (
        SELECT MIN(rowid) FROM fact_posts_comments_comments
        GROUP BY comment_id);

CREATE UNIQUE INDEX comment_comment_id ON 
  fact_posts_comments_comments (comment_id)

Post Reactions

DELETE
FROM    fact_posts_reactions
WHERE   rowid NOT IN (
        SELECT MIN(rowid) FROM fact_posts_reactions
        GROUP BY object_id, reactor_id);

CREATE UNIQUE INDEX reaction_id ON 
  fact_posts_reactions (object_id, reactor_id)

Post Comment Likes

DELETE
FROM    fact_posts_comments_likes
WHERE   rowid NOT IN (
        SELECT MIN(rowid) FROM fact_posts_comments_likes
        GROUP BY object_id, liker_id);

CREATE UNIQUE INDEX comment_like_id ON 
  fact_posts_comments_likes (object_id, liker_id)

Post Comment Reply Likes

DELETE
FROM    fact_posts_comments_comments_likes
WHERE   rowid NOT IN (
        SELECT MIN(rowid) FROM fact_posts_comments_comments_likes
        GROUP BY object_id, liker_id);

CREATE UNIQUE INDEX comment_comment_like_id ON 
  fact_posts_comments_comments_likes (object_id, liker_id)

Users

We factor out the users that appear in tables with a user element into their own dimension tables, to save space.

CREATE TABLE dim_user AS 

WITH users_aggregated AS (

  SELECT    liker_id as user_id, 
            liker_name as user_name
  FROM      fact_posts_comments_comments_likes
  
  UNION
  
  SELECT    liker_id as user_id, 
            liker_name as user_name
  FROM      fact_posts_comments_likes
  
  UNION
  
  SELECT    reactor_id as user_id, 
            reactor_name as user_name
  FROM      fact_posts_reactions
  
  UNION
  
  SELECT    commenter_id as user_id, 
            commenter_name as user_name
  FROM      fact_posts_comments_comments
  
  UNION
  
  SELECT    commenter_id as user_id, 
            commenter_name as user_name
  FROM      fact_posts_comments
  
)

SELECT    DISTINCT 
          user_id,
          user_name
FROM      users_aggregated

SELECT COUNT(DISTINCT user_id) as num_users FROM dim_user

CREATE TABLE dim_user_unique AS 
SELECT  *
FROM    dim_user
WHERE   rowid IN (
        SELECT MAX(rowid) FROM dim_user
        GROUP BY user_id);

CREATE INDEX user_id ON dim_user (user_id)

CREATE UNIQUE INDEX user_unique_id ON dim_user_unique (user_id)

Optimizations

ANALYZE

Final Table Schema

After extracting all Facebook posts, reactions, and comments made on these news pages in 2016, we arrive at the following finalized table schema (a sample query against it follows the list):

  • dim_user - contains all users and their names; duplicate rows correspond to users who changed their names
  • dim_user_unique - contains all users and the latest name they have used, with one row per user
  • fact_posts - contains all posts made by the news pages themselves, excluding posts by others
  • fact_posts_comments - contains all top-level comments made on posts
  • fact_posts_reactions - contains all reactions to the posts
  • fact_posts_comments_likes - contains all likes on top-level comments
  • fact_posts_comments_comments - contains all comment replies (comments on comments)
  • fact_posts_comments_comments_likes - contains all likes on comment replies
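
As a usage sketch, the fact tables join back to the latest user names through dim_user_unique; for example, a hypothetical tally of the most frequent commenters using the dplyr connection created earlier:

# Hypothetical usage: top ten commenters across all posts, with
# their latest known names.
tbl(fbpages.sqlite, "fact_posts_comments") %>% 
  count(commenter_id, sort = TRUE) %>% 
  left_join(
    tbl(fbpages.sqlite, "dim_user_unique"),
    by = c("commenter_id" = "user_id")
  ) %>% 
  head(10) %>% 
  collect()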