Crawling the Annual Reports (i.e., 10-Ks) using R

Terminal and tmux:

Using a remote server for the parsing process.

# Placeholders in angle brackets must be replaced with real values before use.
ssh <account_ID>@<Server_IP_Address>  ## Log in to the remote server over SSH
tmux attach -t <Session Number>  ## Re-attach to the named tmux session
R  ## Start an interactive R session

R Session:

Loading relevant packages to be used in the process.

library(colorout)
library(data.table)
library(dplyr)
library(stringi)
library(TenK)
library(rvest)
library(rjson)
library(stringr)
library(ggplot2)
library(ggtheme)
library(DescTools)

Collecting the SIC codes

# Scrape the SEC's SIC code list page and reduce it to a vector of SIC codes.
sicLIST <- read_html("https://www.sec.gov/info/edgar/siccodes.htm")
SICs <- sicLIST %>%
  html_nodes("td") %>%
  html_text()
# Drop the first 18 <td> cells (presumably page layout preceding the table --
# TODO confirm against the live page). Negative indexing stays correct even
# when fewer than 19 cells come back, unlike `19:length(SICs)`.
SICs <- SICs[-seq_len(18)]
SICs <- SICs[!SICs %in% c("", " ")]
# The remaining cells are laid out as rows of four columns.
SICs <- matrix(SICs, ncol = 4, byrow = TRUE)
# Drop the last row; `drop = FALSE` keeps a matrix so the column subset below
# still works when only a couple of rows are present.
SICs <- SICs[-nrow(SICs), , drop = FALSE] ## SIC Table
# Drop the first row and keep only the first column (the SIC codes).
SICs <- SICs[-1, 1] ## Removing all but the SIC codes

Collecting the CIK (i.e., EDGAR company identifier) codes

# Build the full CIK (EDGAR company identifier) list by paging through EDGAR's
# company browser for every SIC code, 100 companies per request.
#
# Reads:  SICs   - vector of SIC codes.
# Writes: cikVEC - flat character vector of unique CIKs.
cikVEC <- list()
# NOTE(review): EDGAR's `start` appears to be a zero-based row offset, so
# paging begins at 0 to avoid skipping the first company of each SIC code --
# confirm. Integer offsets keep paste0() from rendering large values in
# scientific notation (e.g. "1e+05") inside the URL.
pages <- seq(from = 0L, to = 10000000L, by = 100L)
for (j in seq_along(SICs)) {
  for (i in seq_along(pages)) {
    sec <- read_html(paste0(
      "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=",
      SICs[j], "&owner=include&match=&start=", pages[i],
      "&count=100&hidefilings=0"
    ))
    cik <- sec %>%
      html_nodes("td") %>%
      html_text() %>%
      strsplit("\n") %>%
      unlist() %>%
      stri_trim_both()
    # Keep only the cells that look like CIKs, i.e. those starting with "00".
    cik <- cik[!is.na(cik) & startsWith(cik, "00")]
    if (length(cik) == 0L) {
      # No CIKs on this page: the listing for this SIC code is exhausted.
      print(paste0("End of Loop for j = ", j))
      break
    }
    # Append rather than index by `i`: the page index restarts for every SIC
    # code, so `cikVEC[[i]]` would overwrite results from earlier SIC codes.
    cikVEC[[length(cikVEC) + 1L]] <- cik
    print(paste0("j = ", j, " & i = ", i, " iteration completed."))
  }
} ## Constructing a full CIK (EDGAR company identifier) list
# Clear every object in the workspace except the collected CIKs.
rm(list = ls()[!(ls() %in% c('cikVEC'))])
# Flatten to one character vector. The per-page vectors differ in length, so
# rbind()-ing them would recycle values into a ragged matrix.
cikVEC <- unique(unlist(cikVEC))

Collecting the URLs of the 10-K filings

# Collect, for every CIK, the document URLs whose form type is one of the
# 10-K variants in `formList`.
#
# Reads:  cikVEC   - CIKs to look up.
# Writes: tenkVEC  - data.table with one relative document path per row (V1).
#         tenkVEC2 - data.table with the absolute URLs in column `url`.
#         difTime  - elapsed crawl time.
# NOTE(review): html_session() is deprecated in rvest >= 1.0 in favour of
# session(); kept here to match the rvest version this script was written for.
tenkVEC <- vector("list", length(cikVEC))
formList <- c("10-K", "10-K405", "10KSB", "10KSB40")  ## 10-K forms (including 10-K405, 10KSB, and 10KSB40 forms)

time <- Sys.time()
for (k in seq_along(cikVEC)) {
  TenKs <- html_session(paste0(
    "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=",
    cikVEC[k], "&type=10-k&dateb=&owner=exclude&count=100"
  ))
  tenk <- NULL
  tenkUrl <- NULL
  # Links behind each filing's "Documents" button; left NULL if extraction fails.
  try(tenkUrl <- TenKs %>%
        html_nodes("tr td a#documentsbutton") %>%
        html_attr("href"))

  # length() > 0 rejects both character(0) and the NULL left by a failed try();
  # the NULL case would otherwise fail below on the missing V4 column.
  if (length(tenkUrl) > 0) {
    docUrlList <- vector("list", length(tenkUrl))
    typeList <- vector("list", length(tenkUrl))
    for (m in seq_along(tenkUrl)) {
      # Fetch each filing index page once and reuse it for links and cells.
      filingPage <- html_session(paste0("https://www.sec.gov", tenkUrl[m]))
      docUrlList[[m]] <- filingPage %>%
        html_nodes("tr td a") %>%
        html_attr("href")
      # Table cells reshaped into five columns per row.
      typeList[[m]] <- filingPage %>%
        html_nodes("td") %>%
        html_text() %>%
        matrix(ncol = 5, byrow = TRUE)
    }
    documentUrl <- unlist(docUrlList)
    type <- do.call(rbind, typeList)

    # Columns become V1..V5 plus documentUrl; V4 is matched against formList.
    tenk <- data.table(cbind(type, documentUrl))
    # tenk <- tenk[V2 == "Complete submission text file" | V4 %in% formList, documentUrl]
    tenk <- tenk[V4 %in% formList, documentUrl]
    tenkVEC[[k]] <- tenk
  }
  print(paste0(k, " iteration completed."))
}
print("End of Loop.")
difTime <- Sys.time() - time

tenkVEC <- data.table(unlist(tenkVEC))
tenkVEC2 <- tenkVEC[, .(url = paste0("https://www.sec.gov", V1))]

Collecting the Item 1 sections (i.e., business descriptions) of the annual reports

# Run TenK_process() with retrieve = "BD" on every collected 10-K URL.
#
# Reads:  tenkVEC2   - data.table with the document URLs in column `url`.
# Writes: tenK_item1 - list with exactly one slot per URL (NULL where
#                      processing failed), so positions line up with tenkVEC2.
#         error_list - row indices of the URLs that raised an error.
# Author's note from the original run: length(error_list) = 2,381 --> XBRL Viewer
error_list <- NULL
tenK_item1 <- vector("list", nrow(tenkVEC2))

time <- Sys.time()
for (i in seq_len(nrow(tenkVEC2))) {
  # Hand the condition object back on failure so the bookkeeping happens here
  # instead of through `<<-` inside the handler.
  res <- tryCatch(
    TenK_process(URL = tenkVEC2[i, url], retrieve = "BD"),
    error = function(e) e
  )
  if (inherits(res, "error")) {
    error_list <- append(error_list, i)
    res <- NULL
  }
  # `x[i] <- list(NULL)` stores an explicit NULL slot; `x[[i]] <- NULL` would
  # store nothing, letting trailing failures shorten the list.
  tenK_item1[i] <- list(res)
  print(paste0("#", i, " iteration completed."))
}
print("End of Loop")
Sys.time() - time

Saving workspace

# Write the current workspace to an .RData file ("XXX" is a placeholder name).
save.image(file = "./XXX.RData")

End of Code

Created on Jun. 3, 2017



Enjoy Reading This Article?

Here are some more articles you might like to read next:

  • External Post from diddigest.substack.com
  • Multi-Armed Bandit (MAB) as an Alternative to A/B Testing: a Simulation using R
  • Scraping the Online Job Posting Data: 'Indeed.com'
  • Collection of Useful Packages, Cheat Sheets (R and Python), and Tips
  • Hand-Rolling OLS Using R