A whole game

Readings and class materials for Tuesday, September 26, 2023

date_to_colname <- function(.data) {
  # * wide panel format > header are the dates started at 2nd col from 2017
  dates <- seq.Date(
    from = as.Date("2007-01-01"),
    by = "days", 
    to = Sys.Date()
  ) |> 
    keep(~ lubridate::wday(., week_start = 1) %in% 1:5) |> 
    head(ncol(.data) - 1) |> 
    as.character()
  
  .data |> 
    set_names(c("ticker", dates))
}
bloomberg_raw <- list.files("../data/bloomberg", full.names = TRUE) |> 
  keep(str_detect, "/bloomberg_scores\\d{1,2}.xlsx") |> 
  map(.progress = "reading raw data", \(x) {
    list(
      news_heat = readxl::read_xlsx(x, sheet = 1, progress = FALSE) |> 
        date_to_colname(),
      sentiment_avg = readxl::read_xlsx(x, sheet = 2, progress = FALSE) |> 
        date_to_colname()
    )
  })
reading raw data ■■■■                              10% |  ETA:  1m
New names:
New names:
reading raw data ■■■■■■■ 20% | ETA: 1m
reading raw data ■■■■■■■■■■ 30% | ETA: 1m
reading raw data ■■■■■■■■■■■■■ 40% | ETA: 1m
reading raw data ■■■■■■■■■■■■■■■■ 50% | ETA: 46s
reading raw data ■■■■■■■■■■■■■■■■■■■ 60% | ETA: 37s
reading raw data ■■■■■■■■■■■■■■■■■■■■■■ 70% | ETA: 27s
reading raw data ■■■■■■■■■■■■■■■■■■■■■■■■■ 80% | ETA: 18s
reading raw data ■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 90% | ETA: 9s
• `` -> `...4274`
• `` -> `...4275`
• `` -> `...4276`
• `` -> `...4277`
• `` -> `...4278`
• `` -> `...4279`
• `` -> `...4280`
• `` -> `...4281`
• `` -> `...4282`
• `` -> `...4283`
• `` -> `...4284`
• `` -> `...4285`
• `` -> `...4286`
• `` -> `...4287`
• `` -> `...4288`
• `` -> `...4289`
• `` -> `...4290`
• `` -> `...4291`
• `` -> `...4292`
news_heat_df <- bloomberg_raw |> 
  map_dfr(\(x) {
    x$news_heat |> 
      pivot_longer(-1, 
                   names_to = "time",
                   names_transform = ymd, 
                   values_to = "news_heat") |> 
      mutate(
        ticker = str_remove(ticker, " .*"),
        news_heat = factor(news_heat, levels = 0:4, ordered = TRUE)
      )
  }) |> 
  drop_na()

news_heat_df
# A tibble: 22,129,018 × 3
   ticker time       news_heat
   <chr>  <date>     <ord>    
 1 AAPL   2010-02-16 0        
 2 AAPL   2010-02-17 2        
 3 AAPL   2010-02-18 1        
 4 AAPL   2010-02-19 0        
 5 AAPL   2010-02-22 2        
 6 AAPL   2010-02-23 3        
 7 AAPL   2010-02-24 2        
 8 AAPL   2010-02-25 4        
 9 AAPL   2010-02-26 3        
10 AAPL   2010-03-01 4        
# ℹ 22,129,008 more rows
sentiment_avg_df <- bloomberg_raw |> 
  map_dfr(\(x) {
    x$sentiment_avg |> 
      pivot_longer(-1, 
                   names_to = "time",
                   names_transform = ymd, 
                   values_to = "sentiment_avg") |> 
      mutate(
        ticker = str_remove(ticker, " .*"),
        sentiment_avg = as.numeric(sentiment_avg)
      )
  }) |> 
  drop_na()

sentiment_avg_df
# A tibble: 31,270,528 × 3
   ticker time       sentiment_avg
   <chr>  <date>             <dbl>
 1 AMZN   2007-01-04       -0.500 
 2 AMZN   2007-01-05       -0.500 
 3 AMZN   2007-01-08       -0.500 
 4 AMZN   2007-01-09       -0.500 
 5 AMZN   2007-01-10        0.0555
 6 AMZN   2007-01-11        0.0555
 7 AMZN   2007-01-12        0.0555
 8 AMZN   2007-01-15        0.0555
 9 AMZN   2007-01-16        0.0555
10 AMZN   2007-01-17        0.0555
# ℹ 31,270,518 more rows
bloomberg_df <- list(
  news_heat_df, 
  sentiment_avg_df
) |> 
  reduce(full_join, by = join_by(ticker, time)) |> 
  arrange(ticker, time)

bloomberg_df
# A tibble: 34,499,579 × 4
   ticker time       news_heat sentiment_avg
   <chr>  <date>     <ord>             <dbl>
 1 A      2007-01-05 <NA>             0.0506
 2 A      2007-01-08 <NA>             0.0506
 3 A      2007-01-09 <NA>             0.0506
 4 A      2007-01-10 <NA>             0.0506
 5 A      2007-01-11 <NA>             0.0506
 6 A      2007-01-12 <NA>             0     
 7 A      2007-01-15 <NA>             0     
 8 A      2007-01-16 <NA>             0     
 9 A      2007-01-17 <NA>             0.558 
10 A      2007-01-18 <NA>             0.319 
# ℹ 34,499,569 more rows