This repository was archived by the owner on May 28, 2024. It is now read-only.

Update p2a targets to use NHDv2 #143

Merged · 6 commits · Jun 30, 2022
3 changes: 3 additions & 0 deletions .gitignore
@@ -11,6 +11,9 @@
# Targets directory
_targets

# Certain input files
1_fetch/in/drb_climate_2022_06_14.nc

# Output
*/out/*

65 changes: 0 additions & 65 deletions 1_fetch.R
@@ -135,76 +135,11 @@ p1_targets_list <- list(
format = "file"
),

# Download zipped shapefile of DRB PRMS reaches
tar_target(
p1_reaches_shp_zip,
# [Jeff] I downloaded this manually from science base:
# https://www.sciencebase.gov/catalog/item/5f6a285d82ce38aaa244912e
# Because it's a shapefile, it's not easily downloaded using sbtools
# like other files are (see https://github.com/USGS-R/sbtools/issues/277).
# Because of that and since it's small (<700 Kb) I figured it'd be fine to
# just include in the repo and have it loosely referenced to the sb item ^
"1_fetch/in/study_stream_reaches.zip",
format = "file"
),

# Unzip zipped shapefile
tar_target(
p1_reaches_shp,
{
shapedir = "1_fetch/out/study_stream_reaches"
# `shp_files` is a vector of all files ('dbf', 'prj', 'shp', 'shx')
shp_files <- unzip(p1_reaches_shp_zip, exdir = shapedir)
# return just the .shp file
grep(".shp", shp_files, value = TRUE)
},
format = "file"
),

# read shapefile into sf object
tar_target(
p1_reaches_sf,
st_read(p1_reaches_shp)
),

# Fetch NHDv2 flowline reaches for the area of interest
tar_target(
p1_nhd_reaches_sf,
download_nhdplus_flowlines(huc8 = drb_huc8s)
),

# fetch prms met data
tar_target(
p1_prms_met_data_zip,
download_sb_file(sb_id = "5f6a289982ce38aaa2449135",
file_name = "sntemp_inputs_outputs_drb.zip",
out_dir = "1_fetch/out"),
format = "file"
),

# unzip prms met data
tar_target(
p1_prms_met_data_csv,
{
unzip(zipfile = p1_prms_met_data_zip,exdir = dirname(p1_prms_met_data_zip), overwrite=TRUE)
file.path(dirname(p1_prms_met_data_zip), "sntemp_inputs_outputs_drb.csv")
},
format = "file"
),

# read in prms met data
tar_target(
p1_prms_met_data,
read_csv(p1_prms_met_data_csv, show_col_types = FALSE)
),

  # read in segment attribute data
# [Jeff] I'm including these in the "in" folder because they are unpublished
# They are built in the delaware_model_prep pipeline (1_network/out/seg_attr_drb.feather)
tar_target(
p1_seg_attr_data,
arrow::read_feather("1_fetch/in/seg_attr_drb.feather")
),

# Read in csv file containing the segment/catchment attributes that we want
# to download from ScienceBase:
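The `p1_nhd_reaches_sf` target above is what now supplies NHDv2 flowlines in place of the deleted PRMS shapefile and met-data targets. The sketch below is a rough illustration of how a helper like `download_nhdplus_flowlines()` could be written; it assumes `nhdplusTools::get_huc8()` and `nhdplusTools::get_nhdplus()` are available and is not the project's actual implementation.

```
# Hypothetical sketch, not the repo's actual download_nhdplus_flowlines()
library(nhdplusTools)
library(sf)

download_nhdplus_flowlines <- function(huc8) {
  # fetch the flowlines intersecting each HUC8 polygon, then combine into one sf object
  flowlines_by_huc <- lapply(huc8, function(h) {
    huc_poly <- get_huc8(id = h)                           # HUC8 boundary polygon
    get_nhdplus(AOI = huc_poly, realization = "flowline")  # NHDPlusV2 flowlines
  })
  do.call(rbind, flowlines_by_huc)
}

# usage mirroring the target above:
# p1_nhd_reaches_sf <- download_nhdplus_flowlines(huc8 = drb_huc8s)
```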
2 changes: 1 addition & 1 deletion 2_process.R
@@ -197,7 +197,7 @@ p2_targets_list <- list(
combine_attr_data(nhd_lines = p1_nhd_reaches_sf,
cat_attr_list = p2_cat_attr_list,
vaa_cols = c("SLOPE"),
sites_w_segs = p2_sites_w_nhd_segs)
sites_w_segs = p2_sites_w_segs)
),

# Subset the DRB meteorological data to only include the NHDPlusv2 catchments (COMID)
84 changes: 0 additions & 84 deletions 2_process/src/match_sites_reaches.R
@@ -1,87 +1,3 @@
#' original author: David Watkins
#' from https://code.usgs.gov/wwatkins/national-site-reach/-/blob/master/R/match_sites_reaches.R
#' modified by: Jeff Sadler
#' Match each site with a reach (seg_id/COMID)
#'
#' @description Function to match reaches (NHM flowlines) to point sites
#'
#' @param reach_sf sf object of reach polylines with column "subsegid"
#' @param sites dataframe with columns "lat" "lon"
#' @param sites_crs the crs of the sites table (e.g., 4269 for NAD83)
#' @param max_matches the maximum number of segments that a point can match to
#' @param search_radius the maximum radius in meters to match a point to a segment;
#' segments outside of search_radius will not match
#'
#' @value A data frame with the same columns as the sites input dataframe, plus additional columns
#' of "segidnat", "subsegid" and "bird_dist_to_subseg_m" where "bird_dist_to_subseg_m" is the
#' distance (in meters) between the point and matching flowline
#'
get_site_flowlines <- function(reach_sf, sites, sites_crs,
max_matches = 1, search_radius = 500) {

# set up NHDPlus fields used by get_flowline_index
# Note: we are renaming the subsegid column to COMID because the nhdplusTools
# function requires an sf object with a "COMID" column. This does not have anything
# to do with the actual nhd COMIDs
# Note: the `ToMeas` and `FromMeas` are also required columns for the nhdplusTools
# function. Since we are using our own reaches and not the nhd, these do not have
# the same meaning as they would if we were using the nhd
reaches_nhd_fields <- reach_sf %>%
rename(COMID = subsegid) %>%
mutate(REACHCODE = COMID, ToMeas = 100, FromMeas = 100) %>%
# project reaches to Albers Equal Area Conic so that offsets returned by
# get_flowline_index are in meters rather than degrees
st_transform(5070)

sites_sf <- sites %>%
rowwise() %>%
filter(!is.na(lon), !is.na(lat)) %>%
mutate(Shape = list(st_point(c(lon, lat), dim = "XY"))) %>%
st_as_sf() %>%
st_set_crs(sites_crs) %>%
st_transform(st_crs(reaches_nhd_fields)) %>%
st_geometry()

message('matching flowlines with reaches...')
# Below, precision indicates the resolution of measure precision (in meters) in the output;
# since we are interested in a more accurate estimate of the `offset` distance between a
# point and the matched reach, set precision to 1 m.
# Conduct initial search using a larger radius (search_radius*2) than specified to
# account for any uncertainty in the RANN::nn2 nearest neighbor search. Then
# filter sites to include those within the specified search_radius.
flowline_indices <- nhdplusTools::get_flowline_index(flines = reaches_nhd_fields,
points = sites_sf,
max_matches = max_matches,
search_radius = search_radius*2,
precision = 1) %>%
select(COMID, id, offset) %>%
rename(subsegid = COMID, bird_dist_to_subseg_m = offset) %>%
filter(bird_dist_to_subseg_m <= search_radius)

# nhdplusTools returns an "id" column which is just an index from 1 to
# the number of sites. To later join to the site-ids, we need to add
# a matching index column.
sites <- rowid_to_column(sites, "id")

#rejoin to original reaches df
message("rejoining with other geometries")
sites_w_reach_ids <- sites %>%
# only retain sites that got matched to flowlines and are
# within specified search_radius
right_join(flowline_indices, by = "id") %>%
select(-id) %>%
# add `segidnat` column
left_join(reach_sf %>%
select(subsegid, segidnat) %>%
sf::st_drop_geometry(),
by = "subsegid")

return(sites_w_reach_ids)
}




#' @description Function to match reaches (NHDv2 flowlines) to point sites
#'
#' @param sites sf data frame containing site locations
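The removed `get_site_flowlines()` above is built around `nhdplusTools::get_flowline_index()`. Below is a minimal sketch of that matching pattern using the `p1_nhd_reaches_sf` flowlines from 1_fetch.R; the site coordinates are fabricated, and it assumes the flowlines carry the standard NHDPlus fields (`COMID`, `REACHCODE`, `ToMeas`, `FromMeas`) that `get_flowline_index()` expects.

```
# Illustration only: made-up coordinates; assumes sf, dplyr, tibble, nhdplusTools
library(sf)
library(dplyr)
library(nhdplusTools)

sites <- data.frame(site_id = c("siteA", "siteB"),
                    lon = c(-75.19, -75.45),
                    lat = c(40.01, 39.88))

# project to Albers Equal Area (EPSG:5070) so offsets come back in meters
sites_sf <- st_as_sf(sites, coords = c("lon", "lat"), crs = 4269) %>%
  st_transform(5070)
flowlines_aea <- st_transform(p1_nhd_reaches_sf, 5070)

matches <- get_flowline_index(flines = flowlines_aea,
                              points = st_geometry(sites_sf),
                              max_matches = 1,
                              search_radius = 1000,  # meters in this projection
                              precision = 1)

# `id` indexes the rows of `sites`; `offset` is the point-to-flowline distance
sites_matched <- sites %>%
  tibble::rowid_to_column("id") %>%
  right_join(matches, by = "id") %>%
  filter(offset <= 500)
```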
25 changes: 13 additions & 12 deletions 2a_model.R
@@ -6,18 +6,19 @@ p2a_targets_list <- list(
# join met data with light input data
tar_target(
p2a_met_light_data,
p1_prms_met_data %>%
left_join(p2_daily_max_light %>%
# omit subseg's not included in met data
filter(!subsegid %in% c("3_1","8_1","51_1")) %>%
select(seg_id_nat, date_localtime, frac_light) %>%
p2_met_data_at_obs_sites %>%
mutate(date = as.Date(time, tz = 'Etc/GMT+5')) %>%
left_join(y = p2_daily_max_light %>%
select(COMID, date_localtime, frac_light) %>%
# format column names
rename(light_ratio = frac_light,
date = date_localtime),
by = c("seg_id_nat", "date"))
by = c("COMID", "date")) %>%
select(-time) %>%
relocate(date, .after = COMID)
),

# match to site_ids to seg_ids
# match site_ids to seg_ids
tar_target(
p2a_met_data_w_sites,
match_site_ids_to_segs(p2a_met_light_data, p2_sites_w_segs)
@@ -26,7 +27,7 @@
# match seg attributes with site_ids
tar_target(
p2a_seg_attr_w_sites,
match_site_ids_to_segs(p1_seg_attr_data, p2_sites_w_segs)
match_site_ids_to_segs(p2_seg_attr_data, p2_sites_w_segs)
),

# join the metab data with the DO observations
@@ -72,16 +73,16 @@ p2a_targets_list <- list(

## WRITE OUT PARTITION INPUT AND OUTPUT DATA ##
# write met and seg attribute data for trn/val sites to zarr
# note - I have to subset inputs to only include the train/val sites before passing to subset_and_write_zarr or else I
# get a memory error on the join
# note - I have to subset inputs to only include the train/val sites before
# passing to subset_and_write_zarr or else I get a memory error on the join

# write trn and val input and output data to zarr
tar_target(
p2a_well_obs_data,
{
inputs <- p2a_met_data_w_sites %>%
filter(site_id %in% p2a_trn_val_sites) %>%
inner_join(p2a_seg_attr_w_sites, by = c("site_id", "seg_id_nat"))
inner_join(p2a_seg_attr_w_sites, by = c("site_id", "COMID"))

inputs_and_outputs <- inputs %>%
left_join(p2a_do_and_metab, by=c("site_id", "date"))
@@ -155,7 +156,7 @@ p2a_targets_list <- list(
# include all med-obs sites not in testing sites
filter(site_id %in% p2_med_observed_sites,
!site_id %in% tst_sites) %>%
inner_join(p2a_seg_attr_w_sites, by = c("site_id","seg_id_nat"))
inner_join(p2a_seg_attr_w_sites, by = c("site_id","COMID"))

inputs_and_outputs_med_obs <- inputs_med_obs %>%
left_join(p2a_do_and_metab, by = c("site_id", "date"))
23 changes: 17 additions & 6 deletions 2a_model/src/model_ready_data_utils.R
@@ -4,15 +4,26 @@ match_site_ids_to_segs <-
#'
#' @description match site ids to segment data (e.g., met or attributes)
#'
#' @param seg_data a data frame of meteorological data with column 'seg_id_nat'
#' @param sites_w_segs a dataframe with both segment ids ('segidnat') and site ids ('site_id')
#' @param seg_data a data frame of meteorological data with either column
#' 'seg_id_nat' or 'COMID'
#' @param sites_w_segs a dataframe with both segment ids ('segidnat' or 'COMID')
#' and site ids ('site_id')
#'
#' @value A data frame of seg data with site ids

seg_data <- seg_data %>%
left_join(sites_w_segs[,c("site_id","segidnat")],
by=c("seg_id_nat" = "segidnat"))
return(seg_data)
if(any(grepl('COMID', names(seg_data)))){
seg_data_out <- seg_data %>%
mutate(COMID = as.character(COMID)) %>%
left_join(y = sites_w_segs[,c("site_id","COMID")],
by = "COMID") %>%
arrange(site_id)
} else {
seg_data_out <- seg_data %>%
left_join(sites_w_segs[,c("site_id","segidnat")],
by=c("seg_id_nat" = "segidnat"))
}

return(seg_data_out)
}

write_df_to_zarr <- function(df, index_cols, out_zarr) {
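For reference, a toy call to the updated `match_site_ids_to_segs()` showing the new `COMID` branch; the values are fabricated, and the function is assumed to have been sourced along with dplyr.

```
# Illustration only: toy inputs, not project data
library(dplyr)

seg_data <- data.frame(COMID = c(4648450, 4648450),
                       date = as.Date(c("2019-06-01", "2019-06-02")),
                       tmax_c = c(24.1, 25.3))

sites_w_segs <- data.frame(site_id = "USGS-01475530",
                           COMID = "4648450")

# COMID in seg_data is coerced to character inside the function, so numeric
# COMIDs still join against the character COMIDs in sites_w_segs
match_site_ids_to_segs(seg_data, sites_w_segs)
# returns seg_data with a site_id column appended, rows ordered by site_id
```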
5 changes: 0 additions & 5 deletions README.md
@@ -1,9 +1,4 @@
# drb-do-ml
Code repo for Delaware River Basin machine learning models that predict dissolved oxygen

# References

Study area segment shapefiles from:
```
Oliver, S.K., Appling, A.A., Atshan, R., Watkins, W.D., Sadler, J., Corson-Dosch, H., Zwart, J.A., and Read, J.S., 2021, Data release: Predicting water temperature in the Delaware River Basin: U.S. Geological Survey data release, https://doi.org/10.5066/P9GD8I7A.
```