############################################################################
#version history

#v1 - original code developed on preliminary data
#v2 - cleaned up final version of the code to run on full time series (8/2025)
#v3 - integrated automated analysis window from SOTW year (t-1 ... t-5), toggle for comparison plots, added year to output filename
#v4 - added in additional embayment summary info, and integrated Buzzards Bay Coalition scoring replacement (ingest and use BBC-derived scores directly, not our calculated scores)
#v5 - minor modifications to work off new station input file (1009)
#v6 - fix to v5 to resolve station join - need to have station join on inner join, not outer join

############################################################################
# Load necessary libraries
library(lubridate)
library(dplyr)
library(tidyr)
library(stringr)
library(readxl)
library(writexl)
library(ggplot2)
library(plotly)

############################################################################
#user specifications

setwd("C:/Users/Michael Palmer/Association to Preserve Cape Cod, Inc/Programs & Projects - STATE of the Waters/Data Analysis/CCC_data_processing/final_2015_2024")

data_input <- read.csv('20250815_APCC_SOTW_Marine_2015-2024.csv')
stations <- read_excel('Master_Coastal_Embayment_table_20251009.xlsx') %>%
  mutate(
    Station = as.character(Station),
    Station_number = as.character(Station_number)
  )
BBC_data <- read_excel('BBC_station_data_20250911.xlsx')

SOTW.year <- 2025
SOTW.results.comparison <- "N" #(y/n) toggle to control whether to run comparison between old (APCC) and new (CCC) scores
data.SOTW.old <- "SOTW_CE_scores_2020_2023.xlsx"

############################################################################
# DO NOT TYPE BELOW THIS LINE
############################################################################
#STEP 1. Basic data formatting
# Rename headers
names(data_input) <- c("Datetime", "Name", "Station_name", "Sample_depth",
                       "Parameter_name", "Parameter_short", "Analysis_method", "Value", "Source", "Sample_replicate") 

# Reformat datetime using lubridate (auto format detection)
data_input$Datetime <- as.POSIXct(data_input$Datetime, format = "%m/%d/%Y %H:%M")

# Extract Date from Datetime
data_input$Date <- as.Date(data_input$Datetime)
data_input$Year <- year(data_input$Date)

# Extract water body identifier from site_name
#data_input$CCC_GIS_ID <- gsub(" ", "", sub(".*/", "", data_input$Station_name))
data_input$Station <- gsub(" ", "", sub("/.*", "", data_input$Station_name))
data_input$Station_number <- gsub(" ", "", sub(".*/", "", data_input$Station_name))

data_input <- data_input %>%
  rename(
    Station_ID = Station_name,
    Station_name = Name
  )

station_lookup <- data_input %>%  distinct(Station_name, Station, Station_number, Station_ID)

###########################################################################
#STEP 2. FILTER OUT MONTHS (AND YEARS OF INTEREST)
sotw.window <- (SOTW.year - 5):(SOTW.year - 1)

data_input_filtered <- data_input %>%
filter(
  format(Date, "%m") %in% c("06", "07", "08", "09") &
    Year %in% c(sotw.window)
)

###########################################################################
#STEP 3. Normalize the data structure into a more typical column display
# Processing one parameter at time
# fields needed: turbidity/secchi, temperature, salinity, DO, Phaeo, Chla, DIN, TON (TN-DIN), 
# TEMP, DO, SAL, PHAEO, CHLA, TURB, SECCHI, TN_FINAL, DIN_FINAL, DON

#Calculate the %DO saturation - need a grid with temp, sal and then divide the observed do by the theoretical
data_wide <- data_input_filtered %>%
  # Filter to keep only the parameters of interest
  filter(Parameter_short %in% c("TEMP", "SAL", "DO_FINAL",
                                "SECCHI", "TURB", "PHAEO", "CHLA", "TN_FINAL",
                                "DIN_FINAL", "TON")) %>%
  # Group by ID columns and Parameter_short, then average duplicates
  group_by(Station, Station_name, Station_number, Datetime, Date, Year, Source, Sample_depth, Parameter_short) %>%
  summarise(Value = mean(Value, na.rm = TRUE), .groups = "drop") %>%
  # Pivot wider
  pivot_wider(
    id_cols = c(Station, Station_name, Station_number, Datetime, Date, Year, Source, Sample_depth),
    names_from = Parameter_short,
    values_from = Value
  ) %>%
  arrange(Station, Station_name, Station_number, Datetime, Source, Sample_depth)

#data_wide <- data_wide %>%  rename(TON_RAW = TON)




#IMPLEMENT QA/QC CONTROLS (ACCEPTABLE RANGES)
# Define the valid ranges for each variable
valid_ranges <- list(
  TEMP = c(0, 50),
  SAL = c(0, 35),
  DO_FINAL = c(0, 30),
  SECCHI = c(0, 20),
  TURB = c(0, 100),
  PHAEO = c(0, 400),
  CHLA = c(0, 500),
  TN_FINAL = c(0, 1000),
  DIN_FINAL = c(0, 800),
  TON = c(0, 1000)
)

# Count NAs for just these columns
na_counts_prior <- data_wide %>%
  summarise(across(all_of(names(valid_ranges)), ~ sum(is.na(.))))

# Apply the cleaning
data_wide <- data_wide %>%
  mutate(
    across(
      names(valid_ranges),
      ~ ifelse(.x < valid_ranges[[cur_column()]][1] | .x > valid_ranges[[cur_column()]][2], NA_real_, .x)
    )
  )

na_counts_post <- data_wide %>%
  summarise(across(all_of(names(valid_ranges)), ~ sum(is.na(.))))


#CALCULATE DO_THEOR
data_wide$DO_THEOR <- (exp(-173.4292 + 249.6339 * 100 / (data_wide$TEMP+273.15) + 143.3483 *
                           log((data_wide$TEMP+273.15) / 100) - 21.8492 *
                           (data_wide$TEMP+273.15) / 100 + (data_wide$SAL) *
                           (-0.033096 + 0.014259 * (data_wide$TEMP+273.15) / 100 - 0.0017 *
                              ((data_wide$TEMP+273.15) / 100)^2)) * 1.4276)

#CALCULATE DO_SAT
data_wide$DO_SAT <- data_wide$DO_FINAL/data_wide$DO_THEOR


#calculate total pigment
data_wide$TOTAL_PIGMENTS <- data_wide$PHAEO + data_wide$CHLA

#calculate total organic nitrogen (*note calculating this rather than using TON since TON is not widely
# available - have compared calculated values to the raw values and found good coherence)
data_wide$TON <- data_wide$TN_FINAL - data_wide$DIN_FINAL
data_wide$DIN <- data_wide$DIN_FINAL


#combine secchi and turb into a single column
data_wide <- data_wide %>%  mutate(SECCHI_TURB = coalesce(SECCHI, TURB))

###########################################################################
#STEP 4. CALCULATE EVENT (AND SOURCE) MEANS
data_event <- data_wide %>%
  group_by(Station, Station_name, Station_number, Date, Year, Source) %>%
  summarise(across(c(DO_SAT, TOTAL_PIGMENTS, SECCHI, TURB, DIN, TON),
                   \(x) mean(x, na.rm = TRUE)), .groups = "drop")


###########################################################################
#STEP 5. CALCULATE ANNUAL SOURCE MEANS
data_annual_source_means <- data_event %>%
  group_by(Station, Station_name, Station_number, Year, Source) %>%
  summarise(
    across(
      .cols = c(DO_SAT, TOTAL_PIGMENTS, SECCHI, TURB, DIN, TON),
      .fns = list(
        mean = ~ mean(.x, na.rm = TRUE),
        n    = ~ n_distinct(Date[!is.na(.x)])
      ),
      .names = "{.fn}_{.col}"
    )
  ) 

data_annual_source_means <- data_annual_source_means %>%  mutate(n_SECCHI_TURB = coalesce(n_SECCHI, n_TURB))

#prepare for score calculation by bringing in salt marsh indicator at station-level
data_annual_source_means <- data_annual_source_means %>%
  inner_join(
    stations %>% select(Station, Station_number, Salt_marsh, BBC_site),  # added BBC_site
    by = c("Station" = "Station", "Station_number" = "Station_number")
  ) %>%
  select(1:3, Salt_marsh, BBC_site, everything())  # place BBC_site after Salt_marsh

###########################################################################
#STEP 6. CALCULATE SCORES
#calculate scores using source-level annual means

#HANDLE THE SECCHI/TURBIDITY SCORING
data_score <- data_annual_source_means %>%
  mutate(TRANSP_SCORE = case_when(
    #SECCHI ENDPOINTS    
    str_detect(Source, "SMAST Marine") ~ (log(mean_SECCHI) - log(0.6)) / (log(3) - log(0.6)) * 100,
    str_detect(Source, "Buzzards Bay Coalition") ~ (log(mean_SECCHI) - log(0.6)) / (log(3) - log(0.6)) * 100,
    str_detect(Source, "Mashpee Wampanoag") ~ (log(mean_SECCHI) - log(0.6)) / (log(3) - log(0.6)) * 100,
    #TURBIDITY ENDPOINTS    
    str_detect(Source, "Center for Coastal Studies") ~ (log(mean_TURB) - log(17)) / (log(0.5) - log(17)) * 100,
    str_detect(Source, "WBNERR") ~ (log(mean_TURB) - log(17)) / (log(0.5) - log(17)) * 100,
    TRUE ~ NA_real_  # fallback if Source is something else or SECCHI_TURB is NA
  ))

#CALCULATE OTHER PARAMETER SCORES
#DO scoring varies depending on whether it is a salt marsh site
data_score <- data_score %>%
  mutate(
    DO_SAT_SCORE = case_when(
      Salt_marsh == "Y" ~ (log(mean_DO_SAT) - log(0.2)) / (log(0.7) - log(0.2)) * 100,
      TRUE ~ (log(mean_DO_SAT) - log(0.4)) / (log(0.9) - log(0.4)) * 100
    )
  )

data_score$TOTAL_PIGMENTS_SCORE <- (log(data_score$mean_TOTAL_PIGMENTS) - log(10)) / (log(3) - log(10)) * 100


#DIN scoring varies depending on whether it is a salt marsh site
data_score <- data_score %>%
  mutate(
    DIN_SCORE = case_when(
      Salt_marsh == "Y" ~ (log(mean_DIN) - log(20)) / (log(1) - log(20)) * 100,
      TRUE ~ (log(mean_DIN) - log(10)) / (log(1) - log(10)) * 100
    )
  )

data_score$TON_SCORE <- (log(data_score$mean_TON) - log(43)) / (log(20) - log(43)) * 100


#STEP 6 CALCULATE WEIGHTED AVERAGES of the scores using the n (prior to the floor/ceiling)
data_weighted_means <- data_score %>%
  group_by(Station, Station_name, Station_number, Salt_marsh, BBC_site, Year) %>%
  summarise(
    DO_SAT_SCORE = sum(DO_SAT_SCORE * n_DO_SAT, na.rm = TRUE) / sum(n_DO_SAT, na.rm = TRUE),
    TOTAL_PIGMENTS_SCORE = sum(TOTAL_PIGMENTS_SCORE * n_TOTAL_PIGMENTS, na.rm = TRUE) / sum(n_TOTAL_PIGMENTS, na.rm = TRUE),
    TRANSP_SCORE = sum(TRANSP_SCORE * n_SECCHI_TURB, na.rm = TRUE) / sum(n_SECCHI_TURB, na.rm = TRUE),
    DIN_SCORE = sum(DIN_SCORE * n_DIN, na.rm = TRUE) / sum(n_DIN, na.rm = TRUE),
    TON_SCORE = sum(TON_SCORE * n_TON, na.rm = TRUE) / sum(n_TON, na.rm = TRUE),
    .groups = "drop"
  )


#Null out salt marsh transparency scores (these aren't used at salt marsh sites)
data_weighted_means <- data_weighted_means %>%
  mutate(
    TRANSP_SCORE = case_when(
      Salt_marsh == "Y" ~ NA_real_,
      TRUE ~ TRANSP_SCORE
    )
  )

#APPLY FLOOR/CEILING TO ADJUST SCORE VALUES FALLING ABOVE/BELOW 0/100
cols <- c("TRANSP_SCORE", "TOTAL_PIGMENTS_SCORE", "DO_SAT_SCORE", "DIN_SCORE", "TON_SCORE")
data_weighted_means[cols] <- lapply(data_weighted_means[cols], function(x) {
  x <- ifelse(is.nan(x), NA, x)       # replace NaN with NA
  pmin(pmax(x, 0), 100)               # cap values between 0 and 100
})

#ADJUST PRECISION OF SCORE FIELDS
data_weighted_means[cols] <- lapply(data_weighted_means[cols], function(x) round(x, 1))

#CALCULATE FINAL EI SCORE USING RULES RE: PRESENCE OF >= 3 PARAMETERS, ONE OF WHICH IS NITROGEN
data_annual_score <- data_weighted_means %>%
  rowwise() %>%
  mutate(
    values = list(c_across(c(DO_SAT_SCORE, TOTAL_PIGMENTS_SCORE, TRANSP_SCORE, DIN_SCORE, TON_SCORE))),
    n_present = sum(!is.na(values)),
    has_nitrogen = !is.na(DIN_SCORE) || !is.na(TON_SCORE),
    EI_SCORE = if (n_present >= 3 && has_nitrogen) round(mean(unlist(values), na.rm = TRUE), 2) else NA_real_
  ) %>%
  select(-values, -n_present, -has_nitrogen) %>%
  ungroup()

###########################################################################
#STEP 6. Calculate the final EI score

# calculate the 5-year moving average for each station
data_annual_filtered <- data_annual_score

avg5yr_station_ei_prelim <- data_annual_filtered %>%
  group_by(Station_name, Station, Station_number, Salt_marsh, BBC_site) %>%
  summarize(
    n_years = n_distinct(Year[!is.na(EI_SCORE)]),
    EI_5yr_avg = if (n_years >= 3) {
      round(mean(EI_SCORE, na.rm = TRUE), 2)
    } else {
      NA_real_
    },
    Years_included = paste(sort(unique(Year[!is.na(EI_SCORE)])), collapse = ", "),
    .groups = "drop"
  ) %>%
  rename(Embayment_name = Station_name) %>%
  left_join(
    stations %>% select(Station, Station_number, Latitude, Longitude),
    by = c("Station", "Station_number")
  ) %>%
  select(
    Station, Embayment_name, Station_number, Salt_marsh, BBC_site,
    Latitude, Longitude,
    EI_5yr_avg, Years_included
  )

avg5yr_station_ei_prelim <- avg5yr_station_ei_prelim %>%
  filter(is.na(BBC_site))

#######################################################################
# integrate Buzzards Bay Coalition station-level data
BBC_selected <- BBC_data %>%
  filter(Year == SOTW.year - 1) %>%
  select(
    Station = 1,        # first column
    EI_5yr_avg = 5        # fourth column
  ) %>%
  inner_join(
    stations %>%
      filter(BBC_site == "Y") %>%   # filter for BBC sites only
      select(
        Station, Embayment_name, Station_number, Salt_marsh, BBC_site, Latitude, Longitude
      ),
    by = "Station"
  ) %>%
  mutate(Years_included = NA)   # add null column at the end

avg5yr_station_ei <- bind_rows(
  BBC_selected,
  avg5yr_station_ei_prelim
) %>%
  select(
    Station,
    Embayment_name,
    Station_number,
    Salt_marsh,
    BBC_site,
    Latitude,
    Longitude,
    EI_5yr_avg,
    Years_included
  )

#######################################################################
# assign grade to each station
avg5yr_station_ei$Status = ifelse(avg5yr_station_ei$EI_5yr_avg > 65,
                           'Acceptable; Ongoing Protection is Required',
                           'Unacceptable; Immediate Restoration is Required')

# roll up to the embayment status (if there is even 1 station with an Unacceptable grade, then the entire Embayment is graded as Unacceptable.
# If all stations in an embayment are graded as Acceptable, then that embayment is graded as Acceptable.)
embayment_status <- avg5yr_station_ei %>%
  group_by(Embayment_name) %>%
  summarize(
    Unacceptable_stations = paste(Station[!is.na(Status) & Status == "Unacceptable; Immediate Restoration is Required"], collapse = ", "),
    Acceptable_stations = paste(Station[!is.na(Status) & Status == "Acceptable; Ongoing Protection is Required"], collapse = ", "),
    Missing_status_stations = paste(Station[is.na(Status)], collapse = ", "),
    Lowest_EI_station = if(all(is.na(EI_5yr_avg))) NA_character_ else Station[which.min(EI_5yr_avg)],
    .groups = "drop"
  ) %>%
  mutate(
    Embayment_status = case_when(
      Unacceptable_stations != "" ~ "Unacceptable; Immediate Restoration is Required",
      Unacceptable_stations == "" & Acceptable_stations == "" ~ "Unknown",
      TRUE ~ "Acceptable; Ongoing Protection is Required"
    )
  )

###########################################################################
#run comparison w/ prior SOTW scores generated using internally managed APCC data
if (SOTW.results.comparison == "Y") {
  
  #import old scores
  SOTW_data <- read_excel(data.SOTW.old)
  
  # Dynamically create the column name
  col_name <- paste0("EI_score_", SOTW.year)
  
  # Select that column and rename it
  SOTW_data <- SOTW_data %>%
    select(Station, BBC_site, all_of(col_name)) %>%
    rename(EI_5yr_avg_OLD = all_of(col_name))
  
  # Keep only one row per Station in SOTW_data (e.g., using the first row) -
  # this was needed in previous iterations, but may not be needed in final (but doesn't hurt)
  SOTW_data_unique <- SOTW_data %>%
    group_by(Station) %>%
    slice(1) %>%
    ungroup()
  
  # Perform the left join safely
  Scores_comparison <- avg5yr_station_ei %>%
    left_join(SOTW_data_unique, by = "Station") %>%
    select(Station, Embayment_name, Station_number, BBC_site, EI_5yr_avg, EI_5yr_avg_OLD) %>%
    mutate(
      EI_5yr_avg = round(EI_5yr_avg, 1),
      EI_5yr_avg_OLD = round(EI_5yr_avg_OLD, 1),
      relative_difference = round(((EI_5yr_avg_OLD - EI_5yr_avg) / EI_5yr_avg) * 100, 1)  # % difference
    )
  
  min_val <- min(Scores_comparison$EI_5yr_avg, Scores_comparison$EI_5yr_avg_OLD, na.rm = TRUE)
  max_val <- max(Scores_comparison$EI_5yr_avg, Scores_comparison$EI_5yr_avg_OLD, na.rm = TRUE)
  
  # Create a data frame for the 1:1 line
  line_df <- data.frame(x = c(min_val, max_val), y = c(min_val, max_val))
  

  # Create scatter plot
  compare_plot_df <- Scores_comparison
  compare_plot_df$EI_BBC <- ifelse(compare_plot_df$BBC_site == "Y", compare_plot_df$EI_5yr_avg, NA)
  
  fig <- plot_ly() %>%
    # 1:1 dashed line
    add_trace(
      data = line_df,
      x = ~x,
      y = ~y,
      type = "scatter",
      mode = "lines",
      line = list(dash = "dash", color = "black"),
      showlegend = FALSE
    ) %>%
    # All points in orange
    add_trace(
      data = compare_plot_df,
      x = ~EI_5yr_avg,
      y = ~EI_5yr_avg_OLD,
      type = "scatter",
      mode = "markers",
      marker = list(size = 12, color = "orange"),
      text = ~paste("Embayment:", Embayment_name, "<br>Station:", Station),
      name = "Other Sites"  # << legend entry
    ) %>%
    # BBC points in blue
    add_trace(
      data = compare_plot_df,
      x = ~EI_BBC,
      y = ~EI_5yr_avg_OLD,
      type = "scatter",
      mode = "markers",
      marker = list(size = 12, color = "blue"),
      text = ~paste("Embayment:", Embayment_name, "<br>Station:", Station),
      name = "BBC Site"     # << legend entry
    ) %>%
    layout(
      title = paste0("EI 5-Year Average Comparison for ", SOTW.year),
      xaxis = list(title = "Current water body score"),
      yaxis = list(title = "Previous water body score (OLD)")
    )
  
  print(fig)
  
  write_xlsx(
    list(
      Scores_comparison = Scores_comparison
    ),
    path = paste0("outputs/CE_scores_comparison_", SOTW.year, ".xlsx")
  )
  
    
}

###########################################################################
#STEP 6. Export to Excel
write_xlsx(
  list(
    data_input = data_input,
    data_wide = data_wide,
    data_event = data_event,
    data_annual_source_means = data_annual_source_means,
    data_score = data_score,
    data_annual_score = data_annual_score,
    avg5yr_station_ei = avg5yr_station_ei,
    embayment_status = embayment_status
  ),
  path = paste0("outputs/CCC_CE_output_", SOTW.year, ".xlsx")
)

###########################################################################
#STEP 6 (optional). Export to RDS files
saveRDS(data_input, paste0("outputs/ce_data_input_", SOTW.year, ".rds"))
saveRDS(data_wide, paste0("outputs/ce_data_wide_", SOTW.year, ".rds"))
saveRDS(data_event, paste0("outputs/ce_event_", SOTW.year, ".rds"))
saveRDS(data_annual_source_means, paste0("outputs/ce_data_annual_source_means_", SOTW.year, ".rds"))
saveRDS(data_score, paste0("outputs/ce_score_", SOTW.year, ".rds"))
saveRDS(data_annual_score, paste0("outputs/ce_annual_score_", SOTW.year, ".rds"))
saveRDS(avg5yr_station_ei, paste0("outputs/ce_avg5yr_station_ei_", SOTW.year, ".rds"))
saveRDS(embayment_status, paste0("outputs/ce_embayment_status_", SOTW.year, ".rds"))
