dataset_generator.R

library(tidyverse)
library(RColorBrewer)
library(sf)
library(raster)
library(readr)
library(maps)
library(maptools)
library(sp)
library(leaflet)
library(tigris)
library(stringr)
library(fec12)
library(shiny)
library(htmltools)
library(shinybusy)
library(foreign)

# Load the names and abbreviations of all states ==========
# # TODO: Don't do this every time!! Read in a dataframe
# state_choices <- data.frame(state.name, state.abb)                        # Initialize with standard state names and abb
# names(state_choices) <- c('State', 'Abbreviation')                        # Better column headers
# # Format state names
# state_choices$Adjusted_Name <- tolower(state_choices$State)               # Lower case version of state name
# state_choices$Adjusted_Name <- sub(' ', '_', state_choices$Adjusted_Name) # ...without spaces
#
# saveRDS(state_choices,"state_choices.Rds")
state_choices <- readRDS("data/general/state_choices.Rds")

# Low / moderate income data ==========
# Read in national LMI data and Tracts2District data
national_lmi <- readRDS("data/general/national_lmi.Rds")
options(tigris_class = "sf") # shape file

#Tract2District <- readRDS('data/general/Tract2District.Rds')
Tract2District <- readRDS('data/general/Tract2District.remap.Rds') %>%
  rename(GEOID = GEOID_TRACT_10)

# Function
generate_lowmod_data <- function(state_code) {
  # State census block groups
  state_lmi <- national_lmi %>% filter(Stusab == state_code)
  # Reduce GEOID to census tract
  state_lmi$GEOID <- as.character(state_lmi$GEOID)
  state_lmi$GEOID <- str_sub(state_lmi$GEOID, 1, nchar(state_lmi$GEOID) - 1)
  # Make the low mod percentage of each census tract the average of all block groups within
  # that census tract
  state_lmi_averaged <- state_lmi %>%
    dplyr::select(GEOID, Lowmod_pct) %>%
    group_by(GEOID) %>%
    mutate(Lowmod_pct = median(Lowmod_pct)) %>%
    dplyr::distinct(GEOID, Lowmod_pct, .keep_all = TRUE)

  # Obtain census tract data from year 2015 to make it coherent with the year of LMI data
#  state_tract <- tracts(state = state_code, year = 2015) # Might have to keep this at 2015...
  state_tract <- tracts(state = state_code, year = 2015)

  # Merge and convert Lowmod_pct to range[0, 100]
  # This still looks good!
  state_lowmod <- merge(state_tract, state_lmi_averaged, by = 'GEOID') %>%
    dplyr::select(GEOID, Lowmod_pct) %>%
    mutate(Lowmod_pct = Lowmod_pct * 100)

  # Merge with Tract2District data
  # TODO: Create new version of Tract2District
  state_districts <- Tract2District %>%
    filter(STATE == state_code) %>%
    dplyr::select(GEOID, DISTRICT)

  # TODO: This merge definitely breaks it!!!
  merged <- merge(state_lowmod, state_districts, by="GEOID")
  merged$DISTRICT <- as.numeric(merged$DISTRICT) # Should this be numeric??

  # Take into account of states with single congressional district
  # Name the 'DISTRICT' column 'At-large'
  single_cd_states <- c('AK', 'DE', 'MT', 'ND', 'SD', 'VT', 'WY')
  if (state_code %in% single_cd_states) {
    merged$DISTRICT <- 'At-large'
  }

  # Name and save the dataset
  state_abbr <- state_choices[which(state_choices$Abbreviation == state_code), 3]
  file_path <- paste(state_abbr, '.Rds', sep = '')
  file_path <- paste('data/lowmod/', file_path, sep = '')
  saveRDS(merged, file_path)
}

# Generate lowmod data for individual states ==========
for (i in 1: nrow(state_choices)) {
  state_code <- state_choices[i, 2]
  generate_lowmod_data(state_code)
}

# Congressional district outline data ==========
# Format congressional district elections data
# TODO: New outlines!
# results_house %>%
#   group_by(state, district_id) %>%
#   summarize(N = n()) %>%
#   nrow()
#
# results_house %>%
#   left_join(candidates, by = "cand_id") %>%
#   dplyr::select(state, district_id, cand_name, party, general_votes) %>%
#   arrange(desc(general_votes))
#
# district_elections <- results_house %>%
#   dplyr::mutate(district = parse_number(district_id)) %>%
#   dplyr::mutate(district = as.character(district)) %>%
#   dplyr::group_by(state, district) %>%
#   dplyr::summarize(
#     N = n(),
#     total_votes = sum(general_votes, na.rm = TRUE),
#     d_votes = sum(ifelse(party == "D", general_votes, 0), na.rm = TRUE),
#     r_votes = sum(ifelse(party == "R", general_votes, 0), na.rm = TRUE)
#   ) %>%
#   dplyr::mutate(
#     other_votes = total_votes - d_votes - r_votes,
#     r_prop = r_votes / total_votes,
#     winner = ifelse(r_votes > d_votes, "Republican", "Democrat")
#   )
#
# district_elections$district <- stringr::str_pad(district_elections$district,width=2, side="left", pad="0")
#
# saveRDS(district_elections,"data/general/district_elections.Rds")

district_elections <- readRDS("data/general/district_elections.Rds")

# FIPS2STATENAME$STATEFP <- str_pad(FIPS2STATENAME$STATEFP, width=2, side="left", pad="0")
# saveRDS(FIPS2STATENAME, "data/general/FIPS2STATENAME.Rds")

# Read in congressional districts data
#districts <- readRDS('data/general/districts.Rds') # TODO: Use new shapes!
# districts_new <- congressional_districts(year=2020) # Replace with readRDS()

#districts <- readRDS('data/general/districts_new.Rds') # CD116 districts; don't use!
districts <- readRDS('data/cd118_shapes/districts_cd118.Rds') # Via https://simplemaps.com/data/congress

# districts <- districts %>%
#   dplyr::select(-c("STATENAME","Abbreviation")) %>%
#   left_join(FIPS2STATENAME, by="STATEFP") %>%
#   relocate(STATENAME, .after = STATEFP) %>%
#   left_join(STATENAME2ABB, by="STATENAME") %>%
#   relocate(Abbreviation, .after = STATENAME)

#saveRDS(districts, "data/general/districts_new.Rds")

# Function
generate_cong_districts <- function(state_code) {
  # State congressional district elections data
  state_elections <- district_elections %>%
    dplyr::filter(state == state_code)

  # State shape data
  state_name <- state_choices[which(state_choices$Abbreviation == state_code), 1]
  state_shape <- districts %>%
    dplyr::filter(STATENAME == state_name) %>%
    dplyr::mutate(ID = GEOID) %>%
    dplyr::select(STATENAME, ID, DISTRICT)

  # Merge
  state_merged <- state_shape %>%
    st_transform(4326) %>%  # Using CRS 4326
    inner_join(state_elections, by = c('DISTRICT' = 'district'))

  # Name and save the dataset
  state_abbr <- state_choices[which(state_choices$Abbreviation == state_code), 3]
  file_path <- paste(state_abbr, '.Rds', sep = '')
  file_path <- paste('data/cong_districts/', file_path, sep = '')
  saveRDS(state_merged, file_path)
}

# Generate congressional districts data for individual states ==========
for (i in 1: nrow(state_choices)) {
  state_code <- state_choices[i, 2]
  generate_cong_districts(state_code)
}


# Reservoir data ==========
# Read in reservoirs data
reservoirs <- readRDS("data/general/reservoirs.Rds")

# Function
generate_reservoirs <- function(state_code) {
  # Filter to state reservoirs
  state_reservoirs <- reservoirs %>%
    filter(state == state_code)

  # Read in state lowmod data for spatial merge
  state_abbr <- state_choices[which(state_choices$Abbreviation == state_code), 3]
  lowmod_path <- paste('data/lowmod/', state_abbr, sep = '')
  lowmod_path <- paste(lowmod_path, '.Rds', sep = '')
  state_lowmod_data <- readRDS(lowmod_path)

  # Convert reservoir data to shape file; transform coordinates
  state_reservoirs_spatial <- state_reservoirs %>%
    st_as_sf(coords=c('long', 'lat'), crs = '+proj=longlat +datum=WGS84') %>%
    st_transform(crs = st_crs(state_lowmod_data))

  # saveRDS(state_reservoirs_spatial, "state_reservoirs_spatial.Rds") # For debug

  # Spatial join reservoir data and lowmod data; filter columns
  # TODO: This is producing some NAs; some reservoir points not overlapping with census blocks
  points_in_state <- st_join(state_reservoirs_spatial, state_lowmod_data, left = T)
  points_in_state <- points_in_state[, c('dam_name', 'Lowmod_pct')]

  # saveRDS(points_in_state, "points_in_state.Rds") # for debug

  # Merge back with the original reservoir data
  state_reservoirs_merged <- base::merge(state_reservoirs, points_in_state, by = 'dam_name')

  # Add low mod percentage buckets
  state_reservoirs_merged <- state_reservoirs_merged %>% mutate(
    lowmod_bucket = case_when(
      Lowmod_pct < 20 ~ '0 - 20',
      Lowmod_pct >= 20 & Lowmod_pct < 40 ~ '20 - 40',
      Lowmod_pct >= 40 & Lowmod_pct < 60 ~ '40 - 60',
      Lowmod_pct >= 60 & Lowmod_pct < 80 ~ '60 - 80',
      Lowmod_pct >= 80 ~ '80 - 100',
      is.na(Lowmod_pct) ~ 'No LMI Data'
    )
  )

  # Name and save the dataset
  file_path <- paste(state_abbr, '.Rds', sep = '')
  file_path <- paste('data/reservoirs/', file_path, sep = '')
  saveRDS(state_reservoirs_merged, file_path)
}

# Generate reservoir data for individual states ==========
for (i in 1: nrow(state_choices)) {
  state_code <- state_choices[i, 2]
  generate_reservoirs(state_code)
}


# Electric substation data ==========
# Read in substation data
substations <- read.csv('data/general/electric_substations.csv')

# Function
generate_substations <- function(state_code) {
  # Filter to state substations
  state_substations <- substations %>%
    filter(STATE == state_code) %>%
    # get rid of useless columns
    dplyr::select(-c('NAICS_CODE', 'NAICS_DESC', 'COUNTRY', 'SOURCE',
                     'SOURCEDATE', 'VAL_METHOD', 'VAL_DATE', 'LINES',
                     'MAX_INFER', 'MIN_INFER', 'X', 'Y')) %>%
    # take out unavailable substations
    filter(STATUS != 'NOT AVAILABLE')

  # Name and save the dataset
  state_abbr <- state_choices[which(state_choices$Abbreviation == state_code), 3]
  file_path <- paste(state_abbr, '.Rds', sep = '')
  file_path <- paste('data/substations/', file_path, sep = '')
  saveRDS(state_substations, file_path)
}

# Generate substation data for individual states ==========
for (i in 1: nrow(state_choices)) {
  state_code <- state_choices[i, 2]
  generate_substations(state_code)
}


# Generate ALL DATA ==========
# Or run previous for-loops individually for each data type
for (i in 1: nrow(state_choices)) {
  state_code <- state_choices[i, 2]
  generate_lowmod_data(state_code)
  generate_cong_districts(state_code)
  generate_reservoirs(state_code)
}
	library(tidyverse)
	library(RColorBrewer)
	library(sf)
	library(raster)
	library(readr)
	library(maps)
	library(maptools)
	library(sp)
	library(leaflet)
	library(tigris)
	library(stringr)
	library(fec12)
	library(shiny)
	library(htmltools)
	library(shinybusy)
	library(foreign)

	# Load the names and abbreviations of all states ==========
	# # TODO: Don't do this every time!! Read in a dataframe
	# state_choices <- data.frame(state.name, state.abb) # Initialize with standard state names and abb
	# names(state_choices) <- c('State', 'Abbreviation') # Better column headers
	# # Format state names
	# state_choices$Adjusted_Name <- tolower(state_choices$State) # Lower case version of state name
	# state_choices$Adjusted_Name <- sub(' ', '_', state_choices$Adjusted_Name) # ...without spaces
	#
	# saveRDS(state_choices,"state_choices.Rds")
	state_choices <- readRDS("data/general/state_choices.Rds")

	# Low / moderate income data ==========
	# Read in national LMI data and Tracts2District data
	national_lmi <- readRDS("data/general/national_lmi.Rds")
	options(tigris_class = "sf") # shape file

	#Tract2District <- readRDS('data/general/Tract2District.Rds')
	Tract2District <- readRDS('data/general/Tract2District.remap.Rds') %>%
	rename(GEOID = GEOID_TRACT_10)

	# Function
	generate_lowmod_data <- function(state_code) {
	# State census block groups
	state_lmi <- national_lmi %>% filter(Stusab == state_code)
	# Reduce GEOID to census tract
	state_lmi$GEOID <- as.character(state_lmi$GEOID)
	state_lmi$GEOID <- str_sub(state_lmi$GEOID, 1, nchar(state_lmi$GEOID) - 1)
	# Make the low mod percentage of each census tract the average of all block groups within
	# that census tract
	state_lmi_averaged <- state_lmi %>%
	dplyr::select(GEOID, Lowmod_pct) %>%
	group_by(GEOID) %>%
	mutate(Lowmod_pct = median(Lowmod_pct)) %>%
	dplyr::distinct(GEOID, Lowmod_pct, .keep_all = TRUE)

	# Obtain census tract data from year 2015 to make it coherent with the year of LMI data
	# state_tract <- tracts(state = state_code, year = 2015) # Might have to keep this at 2015...
	state_tract <- tracts(state = state_code, year = 2015)

	# Merge and convert Lowmod_pct to range[0, 100]
	# This still looks good!
	state_lowmod <- merge(state_tract, state_lmi_averaged, by = 'GEOID') %>%
	dplyr::select(GEOID, Lowmod_pct) %>%
	mutate(Lowmod_pct = Lowmod_pct * 100)

	# Merge with Tract2District data
	# TODO: Create new version of Tract2District
	state_districts <- Tract2District %>%
	filter(STATE == state_code) %>%
	dplyr::select(GEOID, DISTRICT)

	# TODO: This merge definitely breaks it!!!
	merged <- merge(state_lowmod, state_districts, by="GEOID")
	merged$DISTRICT <- as.numeric(merged$DISTRICT) # Should this be numeric??

	# Take into account of states with single congressional district
	# Name the 'DISTRICT' column 'At-large'
	single_cd_states <- c('AK', 'DE', 'MT', 'ND', 'SD', 'VT', 'WY')
	if (state_code %in% single_cd_states) {
	merged$DISTRICT <- 'At-large'
	}

	# Name and save the dataset
	state_abbr <- state_choices[which(state_choices$Abbreviation == state_code), 3]
	file_path <- paste(state_abbr, '.Rds', sep = '')
	file_path <- paste('data/lowmod/', file_path, sep = '')
	saveRDS(merged, file_path)
	}

	# Generate lowmod data for individual states ==========
	for (i in 1: nrow(state_choices)) {
	state_code <- state_choices[i, 2]
	generate_lowmod_data(state_code)
	}

	# Congressional district outline data ==========
	# Format congressional district elections data
	# TODO: New outlines!
	# results_house %>%
	# group_by(state, district_id) %>%
	# summarize(N = n()) %>%
	# nrow()
	#
	# results_house %>%
	# left_join(candidates, by = "cand_id") %>%
	# dplyr::select(state, district_id, cand_name, party, general_votes) %>%
	# arrange(desc(general_votes))
	#
	# district_elections <- results_house %>%
	# dplyr::mutate(district = parse_number(district_id)) %>%
	# dplyr::mutate(district = as.character(district)) %>%
	# dplyr::group_by(state, district) %>%
	# dplyr::summarize(
	# N = n(),
	# total_votes = sum(general_votes, na.rm = TRUE),
	# d_votes = sum(ifelse(party == "D", general_votes, 0), na.rm = TRUE),
	# r_votes = sum(ifelse(party == "R", general_votes, 0), na.rm = TRUE)
	# ) %>%
	# dplyr::mutate(
	# other_votes = total_votes - d_votes - r_votes,
	# r_prop = r_votes / total_votes,
	# winner = ifelse(r_votes > d_votes, "Republican", "Democrat")
	# )
	#
	# district_elections$district <- stringr::str_pad(district_elections$district,width=2, side="left", pad="0")
	#
	# saveRDS(district_elections,"data/general/district_elections.Rds")

	district_elections <- readRDS("data/general/district_elections.Rds")

	# FIPS2STATENAME$STATEFP <- str_pad(FIPS2STATENAME$STATEFP, width=2, side="left", pad="0")
	# saveRDS(FIPS2STATENAME, "data/general/FIPS2STATENAME.Rds")

	# Read in congressional districts data
	#districts <- readRDS('data/general/districts.Rds') # TODO: Use new shapes!
	# districts_new <- congressional_districts(year=2020) # Replace with readRDS()

	#districts <- readRDS('data/general/districts_new.Rds') # CD116 districts; don't use!
	districts <- readRDS('data/cd118_shapes/districts_cd118.Rds') # Via https://simplemaps.com/data/congress

	# districts <- districts %>%
	# dplyr::select(-c("STATENAME","Abbreviation")) %>%
	# left_join(FIPS2STATENAME, by="STATEFP") %>%
	# relocate(STATENAME, .after = STATEFP) %>%
	# left_join(STATENAME2ABB, by="STATENAME") %>%
	# relocate(Abbreviation, .after = STATENAME)

	#saveRDS(districts, "data/general/districts_new.Rds")

	# Function
	generate_cong_districts <- function(state_code) {
	# State congressional district elections data
	state_elections <- district_elections %>%
	dplyr::filter(state == state_code)

	# State shape data
	state_name <- state_choices[which(state_choices$Abbreviation == state_code), 1]
	state_shape <- districts %>%
	dplyr::filter(STATENAME == state_name) %>%
	dplyr::mutate(ID = GEOID) %>%
	dplyr::select(STATENAME, ID, DISTRICT)

	# Merge
	state_merged <- state_shape %>%
	st_transform(4326) %>% # Using CRS 4326
	inner_join(state_elections, by = c('DISTRICT' = 'district'))

	# Name and save the dataset
	state_abbr <- state_choices[which(state_choices$Abbreviation == state_code), 3]
	file_path <- paste(state_abbr, '.Rds', sep = '')
	file_path <- paste('data/cong_districts/', file_path, sep = '')
	saveRDS(state_merged, file_path)
	}

	# Generate congressional districts data for individual states ==========
	for (i in 1: nrow(state_choices)) {
	state_code <- state_choices[i, 2]
	generate_cong_districts(state_code)
	}


	# Reservoir data ==========
	# Read in reservoirs data
	reservoirs <- readRDS("data/general/reservoirs.Rds")

	# Function
	generate_reservoirs <- function(state_code) {
	# Filter to state reservoirs
	state_reservoirs <- reservoirs %>%
	filter(state == state_code)

	# Read in state lowmod data for spatial merge
	state_abbr <- state_choices[which(state_choices$Abbreviation == state_code), 3]
	lowmod_path <- paste('data/lowmod/', state_abbr, sep = '')
	lowmod_path <- paste(lowmod_path, '.Rds', sep = '')
	state_lowmod_data <- readRDS(lowmod_path)

	# Convert reservoir data to shape file; transform coordinates
	state_reservoirs_spatial <- state_reservoirs %>%
	st_as_sf(coords=c('long', 'lat'), crs = '+proj=longlat +datum=WGS84') %>%
	st_transform(crs = st_crs(state_lowmod_data))

	# saveRDS(state_reservoirs_spatial, "state_reservoirs_spatial.Rds") # For debug

	# Spatial join reservoir data and lowmod data; filter columns
	# TODO: This is producing some NAs; some reservoir points not overlapping with census blocks
	points_in_state <- st_join(state_reservoirs_spatial, state_lowmod_data, left = T)
	points_in_state <- points_in_state[, c('dam_name', 'Lowmod_pct')]

	# saveRDS(points_in_state, "points_in_state.Rds") # for debug

	# Merge back with the original reservoir data
	state_reservoirs_merged <- base::merge(state_reservoirs, points_in_state, by = 'dam_name')

	# Add low mod percentage buckets
	state_reservoirs_merged <- state_reservoirs_merged %>% mutate(
	lowmod_bucket = case_when(
	Lowmod_pct < 20 ~ '0 - 20',
	Lowmod_pct >= 20 & Lowmod_pct < 40 ~ '20 - 40',
	Lowmod_pct >= 40 & Lowmod_pct < 60 ~ '40 - 60',
	Lowmod_pct >= 60 & Lowmod_pct < 80 ~ '60 - 80',
	Lowmod_pct >= 80 ~ '80 - 100',
	is.na(Lowmod_pct) ~ 'No LMI Data'
	)
	)

	# Name and save the dataset
	file_path <- paste(state_abbr, '.Rds', sep = '')
	file_path <- paste('data/reservoirs/', file_path, sep = '')
	saveRDS(state_reservoirs_merged, file_path)
	}

	# Generate reservoir data for individual states ==========
	for (i in 1: nrow(state_choices)) {
	state_code <- state_choices[i, 2]
	generate_reservoirs(state_code)
	}



	# Electric substation data ==========
	# Read in substation data
	substations <- read.csv('data/general/electric_substations.csv')

	# Function
	generate_substations <- function(state_code) {
	# Filter to state substations
	state_substations <- substations %>%
	filter(STATE == state_code) %>%
	# get rid of useless columns
	dplyr::select(-c('NAICS_CODE', 'NAICS_DESC', 'COUNTRY', 'SOURCE',
	'SOURCEDATE', 'VAL_METHOD', 'VAL_DATE', 'LINES',
	'MAX_INFER', 'MIN_INFER', 'X', 'Y')) %>%
	# take out unavailable substations
	filter(STATUS != 'NOT AVAILABLE')

	# Name and save the dataset
	state_abbr <- state_choices[which(state_choices$Abbreviation == state_code), 3]
	file_path <- paste(state_abbr, '.Rds', sep = '')
	file_path <- paste('data/substations/', file_path, sep = '')
	saveRDS(state_substations, file_path)
	}

	# Generate substation data for individual states ==========
	for (i in 1: nrow(state_choices)) {
	state_code <- state_choices[i, 2]
	generate_substations(state_code)
	}



	# Generate ALL DATA ==========
	# Or run previous for-loops individually for each data type
	for (i in 1: nrow(state_choices)) {
	state_code <- state_choices[i, 2]
	generate_lowmod_data(state_code)
	generate_cong_districts(state_code)
	generate_reservoirs(state_code)
	}