This repository was archived by the owner on May 28, 2024. It is now read-only.

Update p2a targets to use NHDv2 #143

Merged · 6 commits · Jun 30, 2022
3 changes: 3 additions & 0 deletions .gitignore
@@ -11,6 +11,9 @@
# Targets directory
_targets

# Certain input files
1_fetch/in/drb_climate_2022_06_14.nc

# Output
*/out/*

65 changes: 0 additions & 65 deletions 1_fetch.R
@@ -135,76 +135,11 @@ p1_targets_list <- list(
format = "file"
),

# Download zipped shapefile of DRB PRMS reaches
tar_target(
p1_reaches_shp_zip,
# [Jeff] I downloaded this manually from science base:
# https://www.sciencebase.gov/catalog/item/5f6a285d82ce38aaa244912e
# Because it's a shapefile, it's not easily downloaded using sbtools
# like other files are (see https://github.com/USGS-R/sbtools/issues/277).
# Because of that and since it's small (<700 Kb) I figured it'd be fine to
# just include in the repo and have it loosely referenced to the sb item ^
"1_fetch/in/study_stream_reaches.zip",
format = "file"
),

# Unzip zipped shapefile
tar_target(
p1_reaches_shp,
{
shapedir = "1_fetch/out/study_stream_reaches"
# `shp_files` is a vector of all files ('dbf', 'prj', 'shp', 'shx')
shp_files <- unzip(p1_reaches_shp_zip, exdir = shapedir)
# return just the .shp file
grep(".shp", shp_files, value = TRUE)
},
format = "file"
),

# read shapefile into sf object
tar_target(
p1_reaches_sf,
st_read(p1_reaches_shp)
),

# Fetch NHDv2 flowline reaches for the area of interest
tar_target(
p1_nhd_reaches_sf,
download_nhdplus_flowlines(huc8 = drb_huc8s)
),

# fetch prms met data
tar_target(
p1_prms_met_data_zip,
download_sb_file(sb_id = "5f6a289982ce38aaa2449135",
file_name = "sntemp_inputs_outputs_drb.zip",
out_dir = "1_fetch/out"),
format = "file"
),

# unzip prms met data
tar_target(
p1_prms_met_data_csv,
{
unzip(zipfile = p1_prms_met_data_zip,exdir = dirname(p1_prms_met_data_zip), overwrite=TRUE)
file.path(dirname(p1_prms_met_data_zip), "sntemp_inputs_outputs_drb.csv")
},
format = "file"
),

# read in prms met data
tar_target(
p1_prms_met_data,
read_csv(p1_prms_met_data_csv, show_col_types = FALSE)
),

  # read in segment attribute data
# [Jeff] I'm including these in the "in" folder because they are unpublished
# They are built in the delaware_model_prep pipeline (1_network/out/seg_attr_drb.feather)
tar_target(
p1_seg_attr_data,
arrow::read_feather("1_fetch/in/seg_attr_drb.feather")
),

# Read in csv file containing the segment/catchment attributes that we want
# to download from ScienceBase:
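The `p1_nhd_reaches_sf` target above is what now supplies NHDv2 flowlines in place of the deleted PRMS shapefile and met-data targets. The sketch below is a rough illustration of how a helper like `download_nhdplus_flowlines()` could be written; it assumes `nhdplusTools::get_huc8()` and `nhdplusTools::get_nhdplus()` are available and is not the project's actual implementation.

```
# Hypothetical sketch, not the repo's actual download_nhdplus_flowlines()
library(nhdplusTools)
library(sf)

download_nhdplus_flowlines <- function(huc8) {
  # fetch the flowlines intersecting each HUC8 polygon, then combine into one sf object
  flowlines_by_huc <- lapply(huc8, function(h) {
    huc_poly <- get_huc8(id = h)                           # HUC8 boundary polygon
    get_nhdplus(AOI = huc_poly, realization = "flowline")  # NHDPlusV2 flowlines
  })
  do.call(rbind, flowlines_by_huc)
}

# usage mirroring the target above:
# p1_nhd_reaches_sf <- download_nhdplus_flowlines(huc8 = drb_huc8s)
```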
2 changes: 1 addition & 1 deletion 2_process.R
@@ -197,7 +197,7 @@ p2_targets_list <- list(
combine_attr_data(nhd_lines = p1_nhd_reaches_sf,
cat_attr_list = p2_cat_attr_list,
vaa_cols = c("SLOPE"),
sites_w_segs = p2_sites_w_nhd_segs)
sites_w_segs = p2_sites_w_segs)
),

# Subset the DRB meteorological data to only include the NHDPlusv2 catchments (COMID)
84 changes: 0 additions & 84 deletions 2_process/src/match_sites_reaches.R
@@ -1,87 +1,3 @@
#' original author: David Watkins
#' from https://code.usgs.gov/wwatkins/national-site-reach/-/blob/master/R/match_sites_reaches.R
#' modified by: Jeff Sadler
#' Match each site with a reach (seg_id/COMID)
#'
#' @description Function to match reaches (NHM flowlines) to point sites
#'
#' @param reach_sf sf object of reach polylines with column "subsegid"
#' @param sites dataframe with columns "lat" "lon"
#' @param sites_crs the crs of the sites table (e.g., 4269 for NAD83)
#' @param max_matches the maximum number of segments that a point can match to
#' @param search_radius the maximum radius in meters to match a point to a segment;
#' segments outside of search_radius will not match
#'
#' @value A data frame with the same columns as the sites input dataframe, plus additional columns
#' of "segidnat", "subsegid" and "bird_dist_to_subseg_m" where "bird_dist_to_subseg_m" is the
#' distance (in meters) between the point and matching flowline
#'
get_site_flowlines <- function(reach_sf, sites, sites_crs,
max_matches = 1, search_radius = 500) {

# set up NHDPlus fields used by get_flowline_index
# Note: we are renaming the subsegid column to COMID because the nhdplusTools
# function requires an sf object with a "COMID" column. This does not have anything
# to do with the actual nhd COMIDs
# Note: the `ToMeas` and `FromMeas` are also required columns for the nhdplusTools
# function. Since we are using our own reaches and not the nhd, these do not have
# the same meaning as they would if we were using the nhd
reaches_nhd_fields <- reach_sf %>%
rename(COMID = subsegid) %>%
mutate(REACHCODE = COMID, ToMeas = 100, FromMeas = 100) %>%
# project reaches to Albers Equal Area Conic so that offsets returned by
# get_flowline_index are in meters rather than degrees
st_transform(5070)

sites_sf <- sites %>%
rowwise() %>%
filter(!is.na(lon), !is.na(lat)) %>%
mutate(Shape = list(st_point(c(lon, lat), dim = "XY"))) %>%
st_as_sf() %>%
st_set_crs(sites_crs) %>%
st_transform(st_crs(reaches_nhd_fields)) %>%
st_geometry()

message('matching flowlines with reaches...')
# Below, precision indicates the resolution of measure precision (in meters) in the output;
# since we are interested in a more accurate estimate of the `offset` distance between a
# point and the matched reach, set precision to 1 m.
# Conduct initial search using a larger radius (search_radius*2) than specified to
# account for any uncertainty in the RANN::nn2 nearest neighbor search. Then
# filter sites to include those within the specified search_radius.
flowline_indices <- nhdplusTools::get_flowline_index(flines = reaches_nhd_fields,
points = sites_sf,
max_matches = max_matches,
search_radius = search_radius*2,
precision = 1) %>%
select(COMID, id, offset) %>%
rename(subsegid = COMID, bird_dist_to_subseg_m = offset) %>%
filter(bird_dist_to_subseg_m <= search_radius)

# nhdplusTools returns an "id" column which is just an index from 1 to
# the number of sites. To later join to the site-ids, we need to add
# a matching index column.
sites <- rowid_to_column(sites, "id")

#rejoin to original reaches df
message("rejoining with other geometries")
sites_w_reach_ids <- sites %>%
# only retain sites that got matched to flowlines and are
# within specified search_radius
right_join(flowline_indices, by = "id") %>%
select(-id) %>%
# add `segidnat` column
left_join(reach_sf %>%
select(subsegid, segidnat) %>%
sf::st_drop_geometry(),
by = "subsegid")

return(sites_w_reach_ids)
}




#' @description Function to match reaches (NHDv2 flowlines) to point sites
#'
#' @param sites sf data frame containing site locations
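The removed `get_site_flowlines()` above is built around `nhdplusTools::get_flowline_index()`. Below is a minimal sketch of that matching pattern using the `p1_nhd_reaches_sf` flowlines from 1_fetch.R; the site coordinates are fabricated, and it assumes the flowlines carry the standard NHDPlus fields (`COMID`, `REACHCODE`, `ToMeas`, `FromMeas`) that `get_flowline_index()` expects.

```
# Illustration only: made-up coordinates; assumes sf, dplyr, tibble, nhdplusTools
library(sf)
library(dplyr)
library(nhdplusTools)

sites <- data.frame(site_id = c("siteA", "siteB"),
                    lon = c(-75.19, -75.45),
                    lat = c(40.01, 39.88))

# project to Albers Equal Area (EPSG:5070) so offsets come back in meters
sites_sf <- st_as_sf(sites, coords = c("lon", "lat"), crs = 4269) %>%
  st_transform(5070)
flowlines_aea <- st_transform(p1_nhd_reaches_sf, 5070)

matches <- get_flowline_index(flines = flowlines_aea,
                              points = st_geometry(sites_sf),
                              max_matches = 1,
                              search_radius = 1000,  # meters in this projection
                              precision = 1)

# `id` indexes the rows of `sites`; `offset` is the point-to-flowline distance
sites_matched <- sites %>%
  tibble::rowid_to_column("id") %>%
  right_join(matches, by = "id") %>%
  filter(offset <= 500)
```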
25 changes: 13 additions & 12 deletions 2a_model.R
@@ -6,18 +6,19 @@ p2a_targets_list <- list(
# join met data with light input data
tar_target(
p2a_met_light_data,
p1_prms_met_data %>%
left_join(p2_daily_max_light %>%
# omit subseg's not included in met data
filter(!subsegid %in% c("3_1","8_1","51_1")) %>%
select(seg_id_nat, date_localtime, frac_light) %>%
p2_met_data_at_obs_sites %>%
mutate(date = as.Date(time, tz = 'Etc/GMT+5')) %>%
left_join(y = p2_daily_max_light %>%
select(COMID, date_localtime, frac_light) %>%
# format column names
rename(light_ratio = frac_light,
date = date_localtime),
by = c("seg_id_nat", "date"))
by = c("COMID", "date")) %>%
select(-time) %>%
relocate(date, .after = COMID)
),

# match to site_ids to seg_ids
# match site_ids to seg_ids
tar_target(
p2a_met_data_w_sites,
match_site_ids_to_segs(p2a_met_light_data, p2_sites_w_segs)
@@ -26,7 +27,7 @@
# match seg attributes with site_ids
tar_target(
p2a_seg_attr_w_sites,
match_site_ids_to_segs(p1_seg_attr_data, p2_sites_w_segs)
match_site_ids_to_segs(p2_seg_attr_data, p2_sites_w_segs)
),

# join the metab data with the DO observations
@@ -72,16 +73,16 @@ p2a_targets_list <- list(

## WRITE OUT PARTITION INPUT AND OUTPUT DATA ##
# write met and seg attribute data for trn/val sites to zarr
# note - I have to subset inputs to only include the train/val sites before passing to subset_and_write_zarr or else I
# get a memory error on the join
# note - I have to subset inputs to only include the train/val sites before
# passing to subset_and_write_zarr or else I get a memory error on the join

# write trn and val input and output data to zarr
tar_target(
p2a_well_obs_data,
{
inputs <- p2a_met_data_w_sites %>%
filter(site_id %in% p2a_trn_val_sites) %>%
inner_join(p2a_seg_attr_w_sites, by = c("site_id", "seg_id_nat"))
inner_join(p2a_seg_attr_w_sites, by = c("site_id", "COMID"))

inputs_and_outputs <- inputs %>%
left_join(p2a_do_and_metab, by=c("site_id", "date"))
@@ -155,7 +156,7 @@ p2a_targets_list <- list(
# include all med-obs sites not in testing sites
filter(site_id %in% p2_med_observed_sites,
!site_id %in% tst_sites) %>%
inner_join(p2a_seg_attr_w_sites, by = c("site_id","seg_id_nat"))
inner_join(p2a_seg_attr_w_sites, by = c("site_id","COMID"))

inputs_and_outputs_med_obs <- inputs_med_obs %>%
left_join(p2a_do_and_metab, by = c("site_id", "date"))
23 changes: 17 additions & 6 deletions 2a_model/src/model_ready_data_utils.R
@@ -4,15 +4,26 @@ match_site_ids_to_segs <-
#'
#' @description match site ids to segment data (e.g., met or attributes)
#'
#' @param seg_data a data frame of meteorological data with column 'seg_id_nat'
#' @param sites_w_segs a dataframe with both segment ids ('segidnat') and site ids ('site_id')
#' @param seg_data a data frame of meteorological data with either column
#' 'seg_id_nat' or 'COMID'
#' @param sites_w_segs a dataframe with both segment ids ('segidnat' or 'COMID')
#' and site ids ('site_id')
#'
#' @value A data frame of seg data with site ids

seg_data <- seg_data %>%
left_join(sites_w_segs[,c("site_id","segidnat")],
by=c("seg_id_nat" = "segidnat"))
return(seg_data)
if(any(grepl('COMID', names(seg_data)))){
seg_data_out <- seg_data %>%
mutate(COMID = as.character(COMID)) %>%
left_join(y = sites_w_segs[,c("site_id","COMID")],
by = "COMID") %>%
arrange(site_id)
} else {
seg_data_out <- seg_data %>%
left_join(sites_w_segs[,c("site_id","segidnat")],
by=c("seg_id_nat" = "segidnat"))
}

return(seg_data_out)
}

write_df_to_zarr <- function(df, index_cols, out_zarr) {
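For reference, a toy call to the updated `match_site_ids_to_segs()` showing the new `COMID` branch; the values are fabricated, and the function is assumed to have been sourced along with dplyr.

```
# Illustration only: toy inputs, not project data
library(dplyr)

seg_data <- data.frame(COMID = c(4648450, 4648450),
                       date = as.Date(c("2019-06-01", "2019-06-02")),
                       tmax_c = c(24.1, 25.3))

sites_w_segs <- data.frame(site_id = "USGS-01475530",
                           COMID = "4648450")

# COMID in seg_data is coerced to character inside the function, so numeric
# COMIDs still join against the character COMIDs in sites_w_segs
match_site_ids_to_segs(seg_data, sites_w_segs)
# returns seg_data with a site_id column appended, rows ordered by site_id
```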
5 changes: 0 additions & 5 deletions README.md
@@ -1,9 +1,4 @@
# drb-do-ml
Code repo for Delaware River Basin machine learning models that predict dissolved oxygen

# References

Study area segment shapefiles from:
```
Oliver, S.K., Appling, A.A., Atshan, R., Watkins, W.D., Sadler, J., Corson-Dosch, H., Zwart, J.A., and Read, J.S., 2021, Data release: Predicting water temperature in the Delaware River Basin: U.S. Geological Survey data release, https://doi.org/10.5066/P9GD8I7A.
```