############################################################################
#version history

#v1 - original code developed on preliminary data
#v2 - cleaned up final version of the code to run on full time series (8/2025)
#v3 - integrated automated analysis window from SOTW year (t-1 ... t-5), toggle for comparison plots, added year to output filenames


############################################################################
# Load necessary libraries
library(lubridate)
library(dplyr)
library(stringr)
library(readxl)
library(writexl)
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(plotly)

############################################################################
#user specifications
# Working directory. The input files below are read from this folder, and the
# "outputs/..." files written later in the script use paths relative to it.
setwd("C:/Users/Michael Palmer/Association to Preserve Cape Cod, Inc/Programs & Projects - STATE of the Waters/Data Analysis/CCC_data_processing/final_2015_2024")

# Earlier two-file inputs, kept for reference only (not read):
# data_input1 <- read_excel("APCC_SOTW_Ponds_2015_2022.xlsx")
# data_input2 <- read_excel("APCC_SOTW_Ponds_2023_2024.xlsx")
# Monitoring records. STEP 1 renames the columns BY POSITION, so this file must
# keep the 10-column layout listed there.
data_input <- read.csv("20250814_APCC_SOTW_Ponds_2015-2024.csv")
# Water body lookup table. The script reads its CCC_GIS_ID, Name, Town,
# Latitude and Longitude columns when building the final grades.
data_ponds <- read_excel("CCC_Ponds_datafile_20250709.xlsx")

# Reporting year. The analysis window is the five years before it
# (SOTW.year - 5 through SOTW.year - 1); it is also used to select rows from the
# prior-scores file and appears in the output file names.
SOTW.year <- 2021
SOTW.results.comparison <- "Y" # toggle: "Y" runs the comparison between old (APCC) and new (CCC) scores, "N" skips it
# Prior scores workbook, read only when the comparison runs. The script uses its
# CCC_GIS_ID, Year and TSI_5yr_avg columns and keeps rows where Year == SOTW.year.
# NOTE(review): the file name suggests it covers 2022-2024 - confirm it contains
# rows for the SOTW.year chosen above.
data.SOTW.old <- "SOTW_CTI_scores_2022_2024.xlsx"

############################################################################
# DON'T TYPE BELOW THIS LINE
############################################################################
#STEP 1. Basic data formatting
# Rename headers
# Names are assigned by position. A file with extra columns would otherwise be
# mislabelled silently, so stop early if the layout is not the expected one.
if (ncol(data_input) != 10L) {
  stop("data_input has ", ncol(data_input),
       " columns; expected 10 (Datetime ... Sample_replicate).", call. = FALSE)
}
names(data_input) <- c("Datetime", "Name", "Station_name", "Sample_depth",
                       "Parameter_name", "Parameter_short", "Analysis_method", "Value",
                       "Measuring_program", "Sample_replicate")

# Reformat datetime using the fixed month/day/year hour:minute layout.
# The time zone is pinned to UTC so the clock time written in the file is kept
# as-is: as.Date() converts POSIXct values via UTC by default, so parsing in the
# session's local time zone could move evening samples to the next calendar day
# (and make results depend on the computer the script runs on).
data_input$Datetime <- as.POSIXct(data_input$Datetime,
                                  format = "%m/%d/%Y %H:%M", tz = "UTC")

# Rows whose Datetime is NA are dropped by the month/year filter in STEP 2,
# so report them instead of losing them silently.
n_missing_datetime <- sum(is.na(data_input$Datetime))
if (n_missing_datetime > 0) {
  warning(n_missing_datetime,
          " row(s) have a missing or unparseable Datetime and will be excluded.",
          call. = FALSE)
}

# Extract Date from Datetime (same time zone as the parse above)
data_input$Date <- as.Date(data_input$Datetime, tz = "UTC")
data_input$Year <- year(data_input$Date)

# Extract the station identifier from Station_name: keep the text after the
# last "/" and remove all spaces
data_input$CCC_GIS_ID_station <- gsub(" ", "", sub(".*/", "", data_input$Station_name))

###########################################################################
#STEP 2. KEEP ONLY THE MONTHS AND YEARS OF INTEREST
# Analysis window: the five years preceding the SOTW year
sotw.window <- seq(SOTW.year - 5, SOTW.year - 1)

# Retain June-September records that fall inside the analysis window
data_input_filtered <- filter(
  data_input,
  lubridate::month(Date) %in% 6:9,
  Year %in% sotw.window
)


###########################################################################
#STEP 3. Normalize the data structure into a more typical column display and implement basic QA/QC parameter thresholds
#Process SECCHI disk depth
# Keep only the Secchi records, ordered by station, time and sample depth
secchi_data_raw <- data_input_filtered %>%
  filter(Parameter_short == 'SECCHI') %>%
  arrange(CCC_GIS_ID_station, Datetime, Sample_depth)

# Find minimum Value for each combination of station, Datetime and program.
# Value is converted to numeric BEFORE taking the minimum: if read.csv()
# imported the column as text (any non-numeric entry causes that), min() would
# compare strings and e.g. "10.5" would rank below "9.2".
# Groups with no usable reading return NA instead of Inf.
secchi_data <- secchi_data_raw %>%
  mutate(Secchi_reading = as.numeric(Value)) %>%
  group_by(CCC_GIS_ID_station, Year, Date, Datetime, Measuring_program) %>%
  summarise(
    Secchi_m = if (all(is.na(Secchi_reading))) {
      NA_real_
    } else {
      min(Secchi_reading, na.rm = TRUE)
    },
    .groups = "drop"
  )

# Label the monitoring program and apply data quality ranges
secchi_data <- secchi_data %>%
  mutate(
    Source = case_when(
      Measuring_program == "Cape Cod Regional Pond Monitoring Program" ~ "CCRPMP",
      Measuring_program == "Pond and Lake Stewards" ~ "PALS",
      Measuring_program == "Center for Coastal Studies - Pond Monitoring" ~ "CCS",
      TRUE ~ "Other"
    ),
    Secchi_m = ifelse(Secchi_m < 0 | Secchi_m > 20, NA_real_, Secchi_m)  # blank out values outside 0-20
  )

# Annual mean per station. n_obs counts every sampling event in the group,
# including those blanked by the range check above.
secchi_summary <- secchi_data %>%
  group_by(CCC_GIS_ID_station, Year) %>%
  summarise(Secchi_m = mean(Secchi_m, na.rm = TRUE), n_obs = n(), .groups = "drop")


#Process Total Phosphorus (TP)
# TP_ugL is Value multiplied by 30.973762.
# NOTE(review): 30.973762 looks like the molar mass of phosphorus, which would
# mean Value arrives in umol/L - confirm against the data source.
# Records outside 0-850 are blanked (set to NA) rather than dropped.
tp_data <- data_input_filtered %>%
  filter(Parameter_short == 'TP' & !is.na(Value)) %>%
  mutate(
    TP_ugL = {
      tp_converted <- as.numeric(Value) * 30.973762
      replace(tp_converted, which(tp_converted < 0 | tp_converted > 850), NA_real_)
    },
    # Short program code; anything not listed (or missing) becomes "Other"
    Source = {
      program_codes <- c(
        "Cape Cod Regional Pond Monitoring Program" = "CCRPMP",
        "Pond and Lake Stewards" = "PALS",
        "Center for Coastal Studies - Pond Monitoring" = "CCS"
      )
      coalesce(unname(program_codes[as.character(Measuring_program)]), "Other")
    }
  ) %>%
  arrange(CCC_GIS_ID_station, Date, Datetime, Analysis_method, Sample_depth)

# Annual mean per station; n_obs counts all rows in the group, including
# values blanked by the range check
tp_summary <- summarise(
  group_by(tp_data, CCC_GIS_ID_station, Year),
  TP_ugL = mean(TP_ugL, na.rm = TRUE),
  n_obs = n(),
  .groups = "drop"
)



#Process Chlorophyll A
# Chla_ugL is Value coerced to numeric; readings outside 0-500 are blanked
# (set to NA) rather than dropped. Chla_source keeps the analysis method.
chla_data <- data_input_filtered %>%
  filter(Parameter_short == 'CHLA' & !is.na(Value)) %>%
  mutate(
    Chla_ugL = {
      chla_numeric <- as.numeric(Value)
      replace(chla_numeric, which(chla_numeric < 0 | chla_numeric > 500), NA_real_)
    },
    # Short program code; anything not listed (or missing) becomes "Other"
    Source = {
      program_codes <- c(
        "Cape Cod Regional Pond Monitoring Program" = "CCRPMP",
        "Pond and Lake Stewards" = "PALS",
        "Center for Coastal Studies - Pond Monitoring" = "CCS"
      )
      coalesce(unname(program_codes[as.character(Measuring_program)]), "Other")
    },
    Chla_source = Analysis_method
  ) %>%
  arrange(CCC_GIS_ID_station, Date, Datetime, Analysis_method, Sample_depth)

# Annual mean per station; n_obs counts all rows in the group, including
# values blanked by the range check
chla_summary <- summarise(
  group_by(chla_data, CCC_GIS_ID_station, Year),
  Chla_ugL = mean(Chla_ugL, na.rm = TRUE),
  n_obs = n(),
  .groups = "drop"
)

###########################################################################
#STEP 4. Merge 3 summary files together and perform TSI calculations

#MERGE
# Full joins keep every station-year that has at least one of the three
# parameters; parameters that were not measured are NA.
TSI_merged_data <- secchi_summary %>%
  full_join(tp_summary, by = c("CCC_GIS_ID_station", "Year")) %>%
  full_join(chla_summary, by = c("CCC_GIS_ID_station", "Year")) %>%
  select(CCC_GIS_ID_station, Year, Secchi_m, TP_ugL, Chla_ugL)

# An annual mean of exactly 0 passes the 0-based QA/QC ranges but makes the
# logarithms below infinite. Report it here; the scores are set to NA below.
n_zero_means <- with(
  TSI_merged_data,
  sum(Secchi_m == 0 | TP_ugL == 0 | Chla_ugL == 0, na.rm = TRUE)
)
if (n_zero_means > 0) {
  warning(n_zero_means,
          " station-year(s) have an annual mean of 0; the affected TSI values are set to NA.",
          call. = FALSE)
}

#TSI CALCULATIONS
# One index per parameter, each of the form 10 * (6 - <log term> / log(2)).
# Non-finite results (+/-Inf from a zero mean, NaN from an all-missing year)
# become NA so they cannot leak into the averages and grades.
TSI_merged_data <- TSI_merged_data %>%
  mutate(
    Secchi_TSI = 10 * (6 - (log(Secchi_m) / log(2))),
    Secchi_TSI = ifelse(is.finite(Secchi_TSI), Secchi_TSI, NA_real_),
    Chla_TSI = 10 * (6 - ((2.04 - 0.68 * log(Chla_ugL)) / log(2))),
    Chla_TSI = ifelse(is.finite(Chla_TSI), Chla_TSI, NA_real_),
    TP_TSI = 10 * (6 - (log(48 / TP_ugL) / log(2))),
    TP_TSI = ifelse(is.finite(TP_TSI), TP_TSI, NA_real_),
    # Mean of the three indices; NA unless all three are available
    TSI_avg = (Secchi_TSI + Chla_TSI + TP_TSI) / 3
  )

#stations have to be averaged within each water body
#derive the water body ID from the station ID: keep the first two
#dash-separated parts (IDs with fewer than three parts are left unchanged)
TSI_merged_data <- TSI_merged_data %>%
  mutate(CCC_GIS_ID = sub("^([^-]+-[^-]+)-.*", "\\1", CCC_GIS_ID_station))

#annual mean TSI per water body
#Mean_TSI ignores stations without a TSI_avg; Stations counts every station row,
#including those without one
#only water bodies with more than one station are changed by this step
TSI_annual_pond_summary <- summarise(
  group_by(TSI_merged_data, CCC_GIS_ID, Year),
  Mean_TSI = mean(TSI_avg, na.rm = TRUE),
  Stations = n(),
  .groups = "drop"
)

#perform 5-year mean calculations from annual data
# Uses the water-body-level annual means (TSI_annual_pond_summary), i.e. one
# value per water body per year. Working from the station-level rows instead
# would let a pond with several stations satisfy the "at least 3" rule with
# fewer than 3 sampled years, and would weight years with more stations more
# heavily.
TSI_5yr_avg <- TSI_annual_pond_summary %>%
  group_by(CCC_GIS_ID) %>%
  summarise(
    # Years that contributed a valid annual score, sorted and comma-separated
    Years_included = paste(sort(unique(Year[!is.na(Mean_TSI)])), collapse = ", "),

    # Mean of the annual scores; requires at least 3 years with a valid score
    TSI_5yr_avg = {
      annual_scores <- Mean_TSI[!is.na(Mean_TSI)]
      if (length(annual_scores) >= 3) mean(annual_scores) else NA_real_
    },

    # Grade from the multi-year mean: below 50 is acceptable, 50 or above is
    # unacceptable, and water bodies without a mean stay ungraded (NA)
    TSI_grade = if (is.na(TSI_5yr_avg)) {
      NA_character_
    } else if (TSI_5yr_avg < 50) {
      "Acceptable; Ongoing Protection is Required"
    } else {
      "Unacceptable; Immediate Restoration is Required"
    },

    .groups = "drop"
  )

#merge in water body info for presentation, final outputs
# Both tables already share the CCC_GIS_ID column, so it is used directly as the
# join key (no temporary key column is needed).

# A repeated ID in the lookup table would duplicate that water body in the
# grades and inflate the grade counts, so report repeats and keep the first row.
duplicated_pond_ids <- unique(data_ponds$CCC_GIS_ID[duplicated(data_ponds$CCC_GIS_ID)])
duplicated_pond_ids <- duplicated_pond_ids[!is.na(duplicated_pond_ids)]
if (length(duplicated_pond_ids) > 0) {
  warning("data_ponds lists these CCC_GIS_ID values more than once; using the first row of each: ",
          paste(duplicated_pond_ids, collapse = ", "), call. = FALSE)
}

pond_info <- data_ponds %>%
  select(CCC_GIS_ID, Name, Town, Latitude, Longitude) %>%
  distinct(CCC_GIS_ID, .keep_all = TRUE)

# Left join: every scored water body is kept, even without a lookup match
TSI_5yr_avg <- TSI_5yr_avg %>%
  left_join(pond_info, by = "CCC_GIS_ID")

#reorder the columns
Final_grades <- TSI_5yr_avg %>%
  select(Name, Town, Latitude, Longitude, CCC_GIS_ID, TSI_5yr_avg, Years_included, TSI_grade)

# Descriptive columns only, one row per distinct combination
pond_lookup <- Final_grades %>%  distinct(Name, Town, Latitude, Longitude, CCC_GIS_ID)

# Plot
# Histogram of the 5-year average scores in 5-point bins, coloured by grade.
# The plot is stored and printed explicitly: a bare ggplot expression is only
# drawn by top-level auto-printing, which does not happen when the script is
# run with source().
tsi_histogram <- ggplot(Final_grades, aes(x = TSI_5yr_avg, fill = TSI_grade)) +
  geom_histogram(binwidth = 5, color = "white", boundary = 0, closed = "left") +
  scale_fill_manual(
    values = c(
      "Acceptable; Ongoing Protection is Required" = "#2C77BF",  # Blue
      "Unacceptable; Immediate Restoration is Required" = "#D43F3A"  # Red
    )
  ) +
  labs(
    title = "Distribution of 5-Year Average TSI Scores",
    x = "TSI 5-Year Average",
    y = "Number of Water Bodies",
    fill = "Status"
  ) +
  theme_bw()

print(tsi_histogram)

# Number of water bodies per grade; water bodies without a grade are
# reported as "Ungraded"
Grade_summary <- count(
  mutate(Final_grades, TSI_grade = coalesce(TSI_grade, "Ungraded")),
  TSI_grade,
  name = "Count"
)

###########################################################################
#run comparison w/ prior SOTW scores generated using internally managed APCC data
# The toggle is documented as (y/n), so it is matched case-insensitively and
# with surrounding spaces ignored; anything other than Y/y skips this section.
if (isTRUE(toupper(trimws(as.character(SOTW.results.comparison))) == "Y")) {

  #import old scores for the reporting year
  SOTW_data <- read_excel(data.SOTW.old) %>% filter(Year == SOTW.year)

  # Without matching rows every previous score is NA and the plot has no points
  if (nrow(SOTW_data) == 0) {
    warning("No rows with Year == ", SOTW.year, " found in ", data.SOTW.old,
            "; the comparison will contain no previous scores.", call. = FALSE)
  }

  # Rename the TSI_5yr_avg column in SOTW_data before joining so it does not
  # clash with the new score of the same name
  SOTW_data <- SOTW_data %>%
    select(CCC_GIS_ID, Year, TSI_5yr_avg) %>%
    rename(TSI_5yr_avg_OLD = TSI_5yr_avg)

  # Join with Final_grades, keeping all rows from Final_grades
  Scores_comparison <- Final_grades %>%
    left_join(SOTW_data, by = "CCC_GIS_ID") %>%
    select(Town, Name, CCC_GIS_ID, TSI_5yr_avg, TSI_5yr_avg_OLD) %>%
    mutate(
      TSI_5yr_avg = round(TSI_5yr_avg, 1),
      TSI_5yr_avg_OLD = round(TSI_5yr_avg_OLD, 1),
      # % difference of the old score relative to the new score
      relative_difference = round(((TSI_5yr_avg_OLD - TSI_5yr_avg) / TSI_5yr_avg) * 100, 1)
    )

  # Determine min and max for the 1:1 line (across both score columns)
  min_val <- min(Scores_comparison$TSI_5yr_avg, Scores_comparison$TSI_5yr_avg_OLD, na.rm = TRUE)
  max_val <- max(Scores_comparison$TSI_5yr_avg, Scores_comparison$TSI_5yr_avg_OLD, na.rm = TRUE)

  # Create scatter plot
  fig <- plot_ly() %>%
    # Add 1:1 dashed line first (in the background)
    add_lines(
      x = c(min_val, max_val),
      y = c(min_val, max_val),
      line = list(dash = "dash", color = "black"),
      inherit = FALSE,
      showlegend = FALSE
    ) %>%
    # Add scatter points on top
    add_trace(
      data = Scores_comparison,
      x = ~TSI_5yr_avg,
      y = ~TSI_5yr_avg_OLD,
      type = 'scatter',
      mode = 'markers',
      color = ~Town,
      colors = "Set2",
      text = ~paste(
        "Name:", Name,
        "<br>Town:", Town,
        "<br>CCC_GIS_ID:", CCC_GIS_ID,
        "<br>Current score:", TSI_5yr_avg,
        "<br>Previous score (OLD):", TSI_5yr_avg_OLD
      ),
      hoverinfo = 'text',
      marker = list(size = 12)
    ) %>%
    layout(
      title = paste0("TSI 5-Year Average Comparison for ", SOTW.year),
      xaxis = list(title = "Current water body score"),
      yaxis = list(title = "Previous water body score (OLD)"),
      legend = list(title = list(text = "Town"))
    )

  print(fig)

  # write_xlsx() does not create folders, so make sure the target exists
  dir.create("outputs", showWarnings = FALSE, recursive = TRUE)

  write_xlsx(
    list(
      Scores_comparison = Scores_comparison
    ),
    path = paste0("outputs/CTI_scores_comparison_", SOTW.year, ".xlsx")
  )

}


#STEP 5. Export to Excel
# write_xlsx() does not create folders; without this the whole run would fail
# at the very end if "outputs" is missing from the working directory.
dir.create("outputs", showWarnings = FALSE, recursive = TRUE)

# One workbook with one sheet per object; sheet names are the list names
write_xlsx(
  list(
    data_input = data_input,
    secchi_data = secchi_data,
    chla_data = chla_data,
    tp_data = tp_data,
    TSI_merged_data = TSI_merged_data,
    TSI_annual_pond_summary = TSI_annual_pond_summary,
    Final_grades = Final_grades,
    Grade_summary = Grade_summary
  ),
  path = paste0("outputs/CCC_CTI_output_", SOTW.year, ".xlsx")
)


###########################################################################
#STEP 6 (optional). Export to RDS files
# Each object is saved as outputs/fresh_<stem>_<SOTW.year>.rds; the objects and
# their file stems are paired by position in the two lists below.
invisible(Map(
  function(object, stem) {
    saveRDS(object, paste0("outputs/fresh_", stem, "_", SOTW.year, ".rds"))
  },
  list(
    data_input, secchi_data, chla_data, tp_data,
    TSI_merged_data, TSI_annual_pond_summary, Final_grades, Grade_summary
  ),
  c(
    "data_input", "secchi_data", "chla_data", "tp_data",
    "TSI_merged_data", "TSI_annual_pond_summary", "final_grades", "grade_summary"
  )
))