setwd("~/projects/momentum") # where the files are available
file.size("crspdaily.dta") |>
prettyunits::pretty_bytes()
[1] "13.35 GB"
file.size("crspmonthly.dta") |>
prettyunits::pretty_bytes()
[1] "753.24 MB"
Readings and class materials for Tuesday, November 21, 2023
Databases are the most common way for data storage, characterized by their optimization for expedient data retrieval and efficient management of substantial data sets.
SQL queries are the predominant method for accessing databases. If one is seeking job opportunities in the field of data analysis, proficiency in SQL is consistently expected. However, it is worth noting that accomplishing these tasks is also possible using R (with some limitation)!
Let’s consider the following case. We have a daily data about all the historicly traded stocks in the US. The data is stored in a file called crspdaily.dta
(the commonly used format for STATA) and it is 12 GB in size. We want to load the data into R and do some analysis, but our RAM is not sufficient or we want to avoid the efficiency loss that this file would cause.
We can load a dta
file with the haven package, but we will need to load the entire file into memory. This is not a problem if the file is small, but it can be a problem if the file is large. The trick is that we selectively load specific rows of data, avoiding to load the 13 GB into the memory.
setwd("~/projects/momentum") # where the files are available
file.size("crspdaily.dta") |>
prettyunits::pretty_bytes()
[1] "13.35 GB"
file.size("crspmonthly.dta") |>
prettyunits::pretty_bytes()
[1] "753.24 MB"
We will use the RSQLite
package to create a database. The dbConnect()
function creates a connection to the database. The dbWriteTable()
function writes a data frame to the database. The dbDisconnect()
function closes the connection to the database. You can access it with the tbl
function, but…
library(RSQLite)
# Connect to a new SQLite database (this will create the file)
db <- dbConnect(RSQLite::SQLite(), dbname = "database.sqlite")
dbWriteTable(db, "iris_test", iris, overwrite = TRUE)
tbl(db, "iris_test")
# Source: table<iris_test> [?? x 5]
# Database: sqlite 3.43.2 [/Users/marci/projects/bigdata2023/content/database.sqlite]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <chr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
# ℹ more rows
During this time, the data is not being read. With this, we are only preparing a query, but the data itself is not loaded. Therefore, we can see that the table has ??
number of rows. The data will only be loaded when the collect
function is called.
tbl(db, "iris_test") |>
collect()
# A tibble: 150 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <chr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
# ℹ 140 more rows
This is important because our dplyr
functions are translated into R SQL queries using the dbplyr
package, and we only receive the result of the query. Therefore, if we know which rows and columns (or aggregates) of a large gigabyte file we need, we do not need to load the entire dataset.
To expand an existing data table with additional observations, we can utilize the append = TRUE
argument of the dbWriteTable()
function.
dbWriteTable(db, "iris_test", iris, append = TRUE)
tbl(db, "iris_test") |>
collect()
# A tibble: 300 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <chr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
# ℹ 290 more rows
db <- dbConnect(RSQLite::SQLite(), dbname = "crsp.sqlite")
dta_to_db <- function(db = db, path_to_dta, table_name, n_max = 1e5) {
n <- 0
cli::cli_status("Read lines: {n}")
while (TRUE) {
# read only a part of the data
res_df <- haven::read_dta(path_to_dta, n_max = n_max, skip = n)
# write to db
if (n == 0) {
dbWriteTable(db, table_name, res_df, overwrite = TRUE)
} else {
dbWriteTable(db, table_name, res_df, append = TRUE)
}
n <- n + n_max
# print status
cli::cli_status("Read lines: {.s {n}}")
if (nrow(res_df) < n_max) break
}
cli::cli_alert_success("Transformed!")
}
dta_to_db(db, "~/projects/momentum/crspdaily.dta", "crspdaily")
dta_to_db(db, "~/projects/momentum/crspmonthly.dta", "crspmonthly")
RSQLite::dbListTables(db)
[1] "crspdaily" "crspmonthly"
tbl(db, "crspdaily")
# Source: table<crspdaily> [?? x 17]
# Database: sqlite 3.43.2 [/Users/marci/crsp/crsp.sqlite]
permno date shrcd siccd cusip divamt facpr dlret prc vol ret bid
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 10000 5849 NA NA 683916… NA NA NA NA NA NA NA
2 10000 5850 10 3990 683916… NA NA NA -2.56 1000 NA NA
3 10000 5851 10 3990 683916… NA NA NA -2.5 12800 -0.0244 NA
4 10000 5852 10 3990 683916… NA NA NA -2.5 1400 0 NA
5 10000 5853 10 3990 683916… NA NA NA -2.5 8500 0 NA
6 10000 5856 10 3990 683916… NA NA NA -2.62 5450 0.0500 NA
7 10000 5857 10 3990 683916… NA NA NA -2.75 2075 0.0476 NA
8 10000 5858 10 3990 683916… NA NA NA -2.88 22490 0.0455 NA
9 10000 5859 10 3990 683916… NA NA NA -3 10900 0.0435 NA
10 10000 5860 10 3990 683916… NA NA NA -3 8470 0 NA
# ℹ more rows
# ℹ 5 more variables: ask <dbl>, shrout <dbl>, openprc <dbl>, numtrd <dbl>,
# vwretd <dbl>
tbl(db, "crspmonthly")
# Source: table<crspmonthly> [?? x 21]
# Database: sqlite 3.43.2 [/Users/marci/crsp/crsp.sqlite]
PERMNO date SHRCD SICCD TICKER SHRCLS PRIMEXCH HEXCD CUSIP DIVAMT FACPR
<dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
1 10000 5843 NA NA "" "" "" 3 68391610 NA NA
2 10000 5874 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
3 10000 5902 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
4 10000 5933 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
5 10000 5963 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
6 10000 5993 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
7 10000 6024 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
8 10000 6055 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
9 10000 6084 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
10 10000 6116 10 3990 "OMFGA" "A" "Q" 3 68391610 NA NA
# ℹ more rows
# ℹ 10 more variables: DLPRC <dbl>, DLRET <dbl>, PRC <dbl>, VOL <dbl>,
# RET <dbl>, BID <dbl>, ASK <dbl>, SHROUT <dbl>, SPREAD <dbl>, vwretd <dbl>
permno_to_ticker <- tbl(db, "crspmonthly") |>
# stocks at large US markets
filter(SHRCD %in% 10:11, HEXCD == 3, TICKER != "") |>
select(PERMNO, TICKER) |>
distinct()
# ! date format is coerced to numeric in the db
date_filter <- as.integer(as.Date("2018-12-31") - as.Date("1970-01-01"))
crsp_df <- tbl(db, "crspdaily") |>
filter(date > date_filter) |> # after 2018
inner_join(x = permno_to_ticker, by = c("PERMNO" = "permno")) |>
collect()
dbDisconnect(db)
crsp_df
# A tibble: 2,748,416 × 18
PERMNO TICKER date shrcd siccd cusip divamt facpr dlret prc vol
<dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 10026 JJSF 17898 11 2052 46603210 NA NA NA 141 112825
2 10026 JJSF 17899 11 2052 46603210 NA NA NA 143. 84331
3 10026 JJSF 17900 11 2052 46603210 NA NA NA 145. 88233
4 10026 JJSF 17903 11 2052 46603210 NA NA NA 145. 79539
5 10026 JJSF 17904 11 2052 46603210 NA NA NA 149. 70200
6 10026 JJSF 17905 11 2052 46603210 NA NA NA 148. 63591
7 10026 JJSF 17906 11 2052 46603210 NA NA NA 149. 66117
8 10026 JJSF 17907 11 2052 46603210 NA NA NA 149. 58050
9 10026 JJSF 17910 11 2052 46603210 NA NA NA 148. 92311
10 10026 JJSF 17911 11 2052 46603210 NA NA NA 146 89393
# ℹ 2,748,406 more rows
# ℹ 7 more variables: ret <dbl>, bid <dbl>, ask <dbl>, shrout <dbl>,
# openprc <dbl>, numtrd <dbl>, vwretd <dbl>
t <- tempfile()
download.file("https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes17.zip", t)
unzip(t)
industry_raw <- read_file(t)
industry_classification_df <- industry_raw |>
str_remove_all("\r") |>
str_split_1("\n") |>
enframe(value = "raw", name = NULL) |>
filter(raw != "") |>
mutate(
# `parent`: boolean whether or not the row represents a parent industry code
parent = str_starts(raw, "[ ]{0,2}\\d"),
industry17 = ifelse(parent, str_remove(raw, "\\d{1,3} \\w{1,}"), NA),
industry17 = str_squish(industry17),
code = map2(parent, raw, \(p, r) {
if (!p) {
# numbers from the first to the second code extracted from `raw`
# "0100-0199"
seq(
from = as.numeric(str_extract_all(r, "\\d{4}")[[1]][1]),
to = as.numeric(str_extract_all(r, "\\d{4}")[[1]][2])
)
}
}),
industry = str_remove(raw, ".*\\d"),
industry = str_squish(industry)
) |>
select(contains("industry") | contains("code")) |>
fill(industry17) |>
filter(duplicated(industry17)) |>
unnest(code) |>
mutate(code = ifelse(code < 1000, str_c("0", code), code)) |>
select(code, industry17)
download.file("https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_daily_CSV.zip", t)
unzip(t)
kenneth_factors_df <- read_csv(t, skip = 3)
kenneth_factors_df
# A tibble: 25,587 × 5
...1 `Mkt-RF` SMB HML RF
<chr> <dbl> <dbl> <dbl> <dbl>
1 19260701 0.1 -0.25 -0.27 0.009
2 19260702 0.45 -0.33 -0.06 0.009
3 19260706 0.17 0.3 -0.39 0.009
4 19260707 0.09 -0.58 0.02 0.009
5 19260708 0.21 -0.38 0.19 0.009
6 19260709 -0.71 0.43 0.57 0.009
7 19260710 0.62 -0.53 -0.1 0.009
8 19260712 0.04 -0.03 0.64 0.009
9 19260713 0.48 -0.28 -0.2 0.009
10 19260714 0.04 0.07 -0.43 0.009
# ℹ 25,577 more rows