Threshold-Detection/hake_example.Rmd at main · nwfsc-cb/Threshold-Detection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "hake_example"
author: "Raine Detmer"
date: "2023-12-12"
output: html_document
---


README: Code for analyses of spatial hake-temperature relationships and simulations based on these empirical data.


```{r}
library("mgcv")
library("MuMIn") # for AICc
library("gratia")
library("tidyverse")
library("pracma") # for findpeaks function
#library("data.table") #for between() function

library("tictoc") # for timing

# look for thresholds
source("load_functions.R") # loads all the functions in the "functions" folder

options(dplyr.summarise.inform = FALSE)# get rid of the "summarize has grouped output by..." warnings


```


# read in the data


```{r}

hake_tot <- read.csv("hake data/hake_tot.csv") %>% select(-X) # prop. total hake biomass in Canada, remove index column
hake_age5_20 <- read.csv("hake data/hake_age5_20.csv")%>% select(-X) # prop. age 5-10 hake biomass in Canada
hake_age3_4 <- read.csv("hake data/hake_age3_4.csv")%>% select(-X) # prop. age 3-4 hake biomass in Canada
hake_age2 <- read.csv("hake data/hake_age2.csv")%>% select(-X) # prop. age 2 hake biomass in Canada

# year = year of survey
# temp_100_anom = spatially interpolated temperature at 100 m, averaged across all transects between 45 and 49ºN
# xx_CA_US = prop. of xx hake in Canada
# temp_100_anom_z = standardized (mean of zero and sd of one) temp_100_anom
# xx_CA_US_z  = standardized xx_CA_US

# join altogether
#View(hake_tot)

hake_all <- left_join(hake_tot, hake_age5_20, by = c("year", "temp_100_anom", "temp_100_anom_z")) %>% left_join(hake_age3_4, by = c("year", "temp_100_anom", "temp_100_anom_z")) %>% left_join(hake_age2, by = c("year", "temp_100_anom", "temp_100_anom_z"))


```


# threshold detection steps

the functions used for these steps are in emp_functions.R

## linearity check

check for evidence of a nonlinear relationship between the driver (mean temperature anomaly at 100m) and the proportion of hake biomass in Canada for each of the hake age groups

```{r}

#lin_test(dt = hake_tot, xvar = "temp_100_anom", yvar = "wgt_total_CA_US")# gam better by 1.1
#lin_test(dt = hake_age5_20, xvar = "temp_100_anom", yvar = "age5_20_CA_US") # gam better by 1.1
#lin_test(dt = hake_age3_4, xvar = "temp_100_anom", yvar = "age3_4_CA_US") # exact same
#lin_test(dt = hake_age2, xvar = "temp_100_anom", yvar = "age2_CA_US") # exact same


lin_test(dt = hake_tot, xvar = "temp_100_anom", yvar = "wgt_total_CA_US", aic_type = "AICc")# lm better by 0.2
lin_test(dt = hake_age5_20, xvar = "temp_100_anom", yvar = "age5_20_CA_US", aic_type = "AICc") # lm better by 1.6
lin_test(dt = hake_age3_4, xvar = "temp_100_anom", yvar = "age3_4_CA_US", aic_type = "AICc") # exact same
lin_test(dt = hake_age2, xvar = "temp_100_anom", yvar = "age2_CA_US", aic_type = "AICc") # exact same


```

## gam and deriv fitting

calculate the GAM predictions and GAM derivatives for the full and jackknifed data sets for each hake age group

```{r}

length_val <- 50

tot_fit <- all_fits(dt =hake_tot, xvar = "temp_100_anom", yvar = "wgt_total_CA_US", xlength = length_val)

age5_20_fit <- all_fits(dt =hake_age5_20, xvar = "temp_100_anom", yvar = "age5_20_CA_US", xlength = length_val)

age3_4_fit <- all_fits(dt =hake_age3_4, xvar = "temp_100_anom", yvar = "age3_4_CA_US", xlength = length_val)

age2_fit <- all_fits(dt =hake_age2, xvar = "temp_100_anom", yvar = "age2_CA_US", xlength = length_val)

```


## threshold calculations

calculate the jackknifed threshold estimates for the total biomass and age 5-20 hake age groups

```{r}
tot_thresh <- thresh_calc(dt = hake_tot, xvar = "temp_100_anom", yvar = "wgt_total_CA_US", xlength = length_val, all_results = tot_fit)

age5_20_thresh <- thresh_calc(dt = hake_age5_20, xvar = "temp_100_anom", yvar = "age5_20_CA_US", xlength = length_val, all_results = age5_20_fit)

#View(age5_20_thresh$jack_summ)

#age5_20_thresh$tot_thresh
#age5_20_thresh$jack_summ

#age5_20_thresh$all_thresh

#View(tot_thresh$jack_summ)

```


# plot fits

get GAM and LM fits on the raw (not standardized scale) for plotting

```{r}

# driver values
hake_x <- seq(from = min(hake_all$temp_100_anom, na.rm = T), to = max(hake_all$temp_100_anom, na.rm = T), length.out = 100) # by 0.01

# gams fit on raw scale
tot_hake.gm <- gam(wgt_total_CA_US~s(temp_100_anom,k=4,bs="tp"),data = hake_all)

age5_20_hake.gm <- gam(age5_20_CA_US~s(temp_100_anom,k=4,bs="tp"),data = hake_all)

# age 3-4 hake
#age3_4_hake.lm <- lm(age3_4_CA_US ~ temp_100_anom, data = hake_all)
age3_4_hake.gm <- gam(age3_4_CA_US~s(temp_100_anom,k=4,bs="tp"),data = hake_all)

# age 2 hake
#age2_hake.lm <- lm(age2_CA_US ~ temp_100_anom, data = hake_all)
age2_hake.gm <- gam(age2_CA_US~s(temp_100_anom,k=4,bs="tp"),data = hake_all)

gam_list <- list(tot = tot_hake.gm, age5_20 = age5_20_hake.gm, age3_4 = age3_4_hake.gm, age2 = age2_hake.gm)

sub_names <- c("tot", "age5_20", "age3_4", "age2")

# get the mean and simultaneous intervals of these gam's predictions
for(i in 1:length(sub_names)){

  gam_pred <- gratia::fitted_values(gam_list[[i]], data = data.frame(temp_100_anom = hake_x))

  gam_df <- data.frame(
    temp_100_anom = hake_x,
    mn = gam_pred$fitted,
    up = gam_pred$upper,
    down = gam_pred$lower,
    hake_sub = rep(sub_names[i], length(hake_x))
  )

  if(i == 1){
    pred_df <- gam_df
  } else{
    pred_df <- rbind(gam_df, pred_df)
  }

}


# get the linear model for the age 5-20 group
age5_20_hake.lm <- gam(age5_20_CA_US~temp_100_anom,data = hake_all)

lm_pred <- stats::predict(age5_20_hake.lm, newdata = data.frame(temp_100_anom = hake_x), se.fit = TRUE)

  lm_df <- data.frame(
    temp_100_anom = hake_x,
    mn = lm_pred$fit,
    up = lm_pred$fit + lm_pred$se.fit,
    down = lm_pred$fit - lm_pred$se.fit,
    mod = rep("lm", length(hake_x)),
    hake_sub = rep(sub_names[i], length(hake_x))
  )


# get the threshold calculations
# note for some reason, the data need to be data frames not tibbles, otherwise length(dd[,1]) is calculated as 1 not the actual length of the data
hake_age5_20_sim <- hake_age5_20 %>% mutate(sim = 1, thresh_loc = NA, thresh_loc_z = NA)%>% rename(driver = temp_100_anom, obs_response =age5_20_CA_US)

#pred_5_20 <- jack_results(simdt = as.data.frame(hake_age5_20_sim), sim_choice = c(1), x_pred = NA, xlength=50, z_scale = FALSE)

thresh_5_20_summ <- jack_thresh(as.data.frame(hake_age5_20_sim), x_pred = NA, xlength = 50, z_scale = F)


```


look at the model summaries

```{r}

# linear model for the age 5-20 group
summary(age5_20_hake.lm)

# gam for the age 5-20 group
summary(age5_20_hake.gm)

# gam for the age 3-4 group
summary(age3_4_hake.gm)

# gam for the age 2 group
summary(age2_hake.gm)

# total
summary(tot_hake.gm)

```


## age 5-20

plot the linear and gam model fits with the data for the age 5-20 group

```{r}

lab_adj <- 0.04 # how high above points to put year label
ymax <- 0.75

pdf("figurepdfs/hake_fits_520_test.pdf", height = 3.5)
par(mfrow = c(1, 2))
par(mar=c(0.5, 1, 1, 0), oma = c(3, 2.5, 1, 0.5))

dt_sub <- pred_df[which(pred_df$hake_sub=="age5_20"),]

plot(x = hake_age5_20$temp_100_anom, y = hake_age5_20$age5_20_CA_US, xlab = NA, ylab = NA, pch = 16, col = adjustcolor("black", alpha.f = 0.75), las = 1, ylim = c(0, ymax), xaxt = "n", yaxt = "n")
axis(side = 1, at = seq(from = -0.4, to = 0.4, by = 0.2)) #, labels = NA
axis(side = 2, at = seq(from = 0, to = 0.6, by = 0.2), las = 1)
#axis(side = 2, at = seq(0, 0.6, by = 0.1), las = 1)
lines(x =hake_x, y = dt_sub$mn, lty = 2)
polygon(x = c(hake_x, rev(hake_x)), y = c(dt_sub$up, rev(dt_sub$down)), col = adjustcolor("gray20", alpha.f = 0.2), border = NA)
mtext(side = 3, "GAM")
mtext(side = 2, "Proportion biomass in Canada", line = 2.5)
mtext(side = 1, "Mean 100m temperature anomaly between 45-49ºN", line = 1.5, outer = T)
#mtext(side = 3, "Ages 5-20", line = -1.5)
# add the extreme points
for(i in 1:3){
text(x = hake_age5_20$temp_100_anom[which(hake_age5_20$temp_100_anom>0.25)][i]-0.01, y = hake_age5_20$age5_20_CA_US[which(hake_age5_20$temp_100_anom>0.25)][i] + lab_adj, as.character(hake_age5_20$year[which(hake_age5_20$temp_100_anom>0.25)][i]), cex =0.675)
}
abline(v = thresh_5_20_summ$thresh_mean[which(thresh_5_20_summ$thresh_method=="abs_max_d2"& thresh_5_20_summ$sig_type=="none")], col = "blue", lty = 1)

dt_sub <- lm_df

plot(x = hake_age5_20$temp_100_anom, y = hake_age5_20$age5_20_CA_US, xlab = NA, ylab = NA, pch = 16, col = adjustcolor("black", alpha.f = 0.75), las = 1, ylim = c(0, ymax), xaxt = "n", yaxt = "n")
axis(side = 1, at = seq(from = -0.4, to = 0.4, by = 0.2)) #, labels = NA
axis(side = 2, at = seq(from = 0, to = 0.6, by = 0.2), labels = NA)
mtext(side = 3, "Linear model")
#axis(side = 2, at = seq(0, 0.6, by = 0.1), las = 1)
lines(x =hake_x, y = dt_sub$mn, lty = 2)
polygon(x = c(hake_x, rev(hake_x)), y = c(dt_sub$up, rev(dt_sub$down)), col = adjustcolor("gray20", alpha.f = 0.2), border = NA)
#mtext(side = 3, "Ages 5-20", line = -1.5)
# add the extreme points
for(i in 1:3){
text(x = hake_age5_20$temp_100_anom[which(hake_age5_20$temp_100_anom>0.25)][i] - 0.01, y = hake_age5_20$age5_20_CA_US[which(hake_age5_20$temp_100_anom>0.25)][i] + lab_adj, as.character(hake_age5_20$year[which(hake_age5_20$temp_100_anom>0.25)][i]), cex =0.675)
}

dev.off()

# add threshold as a dashed arrow, in style of Samhouri and Large papers?


```


## all

plot the GAM fits for all age groups

```{r}

# get thresholds on raw scale (need the mean and sd of the drivers and responses for the invzfun)

ymin <- 0
ymax <- max(hake_all$wgt_total_CA_US) + 0.25

#lab_adj <- 0.04 # how high above points to put year label
lab_adj <- 0.022 # how high above points to put year label

# xvalues used for calculating thresholds on standardized scale
rangei <- max(hake_x, na.rm = T) - min(hake_x, na.rm = T)
hake_xs <- seq(from = min(hake_x, na.rm = T), to = max(hake_x, na.rm = T), length.out =max(100, rangei*50))# make sure length is at least 100


# plot results
#pdf("figurepdfs/hake_fits_all_test.pdf")
pdf("figurepdfs/hake_fits_all_years_labeled.pdf")
par(mfrow = c(2, 2))
par(mar=c(0.5, 1, 1, 0), oma = c(3, 2.5, 1, 0.5))
# total
dt_sub <- pred_df[which(pred_df$hake_sub=="tot"),]
plot(x = hake_tot$temp_100_anom, y = hake_tot$wgt_total_CA_US, xlab = NA, ylab = NA, pch = 16, col = adjustcolor("black", alpha.f = 0.75), las = 1, ylim = c(0, ymax), xaxt = "n")
axis(side = 1, at = seq(from = -0.4, to = 0.4, by = 0.2), labels = NA)
mtext(side = 2, "Proportion biomass in Canada", line = 1.25, outer = T)
lines(x =hake_x, y = dt_sub$mn, lty = 2)
polygon(x = c(hake_x, rev(hake_x)), y = c(dt_sub$up, rev(dt_sub$down)), col = adjustcolor("gray20", alpha.f = 0.2), border = NA)
mtext(side = 3, "Total hake biomass", line = -1.5)
# add the extreme points
# for(i in 1:3){
# text(x = hake_tot$temp_100_anom[which(hake_tot$temp_100_anom>0.25)][i], y = hake_tot$wgt_total_CA_US[which(hake_tot$temp_100_anom>0.25)][i] + lab_adj, as.character(hake_tot$year[which(hake_tot$temp_100_anom>0.25)][i]), cex =0.8)
# }
# add all points
for(i in 1:length(hake_tot$temp_100_anom)){
text(x = hake_tot$temp_100_anom[i], y = hake_tot$wgt_total_CA_US[i] + lab_adj, as.character(hake_tot$year[i]), cex =0.8)
}

# 5-20

dt_sub <- pred_df[which(pred_df$hake_sub=="age5_20"),]

plot(x = hake_age5_20$temp_100_anom, y = hake_age5_20$age5_20_CA_US, xlab = NA, ylab = NA, pch = 16, col = adjustcolor("black", alpha.f = 0.75), las = 1, ylim = c(0, ymax), xaxt = "n", yaxt = "n")
axis(side = 1, at = seq(from = -0.4, to = 0.4, by = 0.2), labels = NA)
axis(side = 2, at = seq(from = 0, to = 0.6, by = 0.2), labels = NA)
#axis(side = 2, at = seq(0, 0.6, by = 0.1), las = 1)
lines(x =hake_x, y = dt_sub$mn, lty = 2)
polygon(x = c(hake_x, rev(hake_x)), y = c(dt_sub$up, rev(dt_sub$down)), col = adjustcolor("gray20", alpha.f = 0.2), border = NA)
mtext(side = 3, "Ages 5-20", line = -1.5)
# add the extreme points
# for(i in 1:3){
# text(x = hake_age5_20$temp_100_anom[which(hake_age5_20$temp_100_anom>0.25)][i], y = hake_age5_20$age5_20_CA_US[which(hake_age5_20$temp_100_anom>0.25)][i] + lab_adj, as.character(hake_age5_20$year[which(hake_age5_20$temp_100_anom>0.25)][i]), cex =0.8)
# }
# add all points
for(i in 1:length(hake_age5_20$temp_100_anom)){
text(x = hake_age5_20$temp_100_anom[i], y = hake_age5_20$age5_20_CA_US[i] + 1.25*lab_adj, as.character(hake_age5_20$year[i]), cex =0.8)
}

# age 3-4
dt_sub <- pred_df[which(pred_df$hake_sub=="age3_4"),]

plot(x = hake_age3_4$temp_100_anom, y = hake_age3_4$age3_4_CA_US, xlab = NA, ylab = NA, pch = 16, col = adjustcolor("black", alpha.f = 0.75), las = 1, ylim = c(0, ymax))#, yaxt = "n"
#axis(side = 2, at = seq(0, 0.6, by = 0.1), las = 1)
mtext(side = 3, "Ages 3-4", line = -1.5)
mtext(side = 1, "Mean 100m temperature anomaly between 45-49ºN", line = 1.5, outer = T)
lines(x =hake_x, y = dt_sub$mn, lty = 2)
polygon(x = c(hake_x, rev(hake_x)), y = c(dt_sub$up, rev(dt_sub$down)), col = adjustcolor("gray20", alpha.f = 0.2), border = NA)

# for(i in 1:3){
# text(x = hake_age3_4$temp_100_anom[which(hake_age3_4$temp_100_anom>0.25)][i], y = hake_age3_4$age3_4_CA_US[which(hake_age3_4$temp_100_anom>0.25)][i] + lab_adj, as.character(hake_age3_4$year[which(hake_age3_4$temp_100_anom>0.25)][i]), cex =0.8)
# }

# add all points
for(i in 1:length(hake_age3_4$temp_100_anom)){
text(x = hake_age3_4$temp_100_anom[i], y = hake_age3_4$age3_4_CA_US[i] + lab_adj, as.character(hake_age3_4$year[i]), cex =0.8)
}


# age 2
dt_sub <- pred_df[which(pred_df$hake_sub=="age2"),]

plot(x = hake_age2$temp_100_anom, y = hake_age2$age2_CA_US, xlab = NA, ylab = NA, pch = 16, col = adjustcolor("black", alpha.f = 0.75), las = 1, ylim = c(0, ymax), yaxt = "n")
axis(side = 2, at = seq(0, 0.6, by = 0.2), labels = NA)
mtext(side = 3, "Age 2", line = -1.5)
#mtext(side = 1, "Mean 100m temp anomaly between 45-49ºN (standardized)", line = 0.5, outer = T)
lines(x =hake_x, y = dt_sub$mn, lty = 2)
polygon(x = c(hake_x, rev(hake_x)), y = c(dt_sub$up, rev(dt_sub$down)), col = adjustcolor("gray20", alpha.f = 0.2), border = NA)

# for(i in 1:3){
# text(x = hake_age2$temp_100_anom[which(hake_age2$temp_100_anom>0.25)][i], y = hake_age2$age2_CA_US[which(hake_age2$temp_100_anom>0.25)][i] + lab_adj, as.character(hake_age2$year[which(hake_age2$temp_100_anom>0.25)][i]), cex =0.8)
# }

# add all points
for(i in 1:length(hake_age2$temp_100_anom)){
text(x = hake_age2$temp_100_anom[i], y = hake_age2$age2_CA_US[i] + 1.8*lab_adj, as.character(hake_age2$year[i]), cex =0.8)
}

dev.off()

```


```{r}
# plot age 2 separately

dt_sub <- pred_df[which(pred_df$hake_sub=="age2"),]

pdf("figurepdfs/hake_fits_age2_years_labeled.pdf")
#par(mfrow = c(2, 2))
#par(mar=c(0.5, 1, 1, 0), oma = c(3, 2.5, 1, 0.5))
plot(x = hake_age2$temp_100_anom, y = hake_age2$age2_CA_US, xlab = NA, ylab = NA, pch = 16, col = adjustcolor("black", alpha.f = 0.75), las = 1, ylim = c(0, 0.1))
axis(side = 2, at = seq(0, 0.6, by = 0.2), labels = NA)
mtext(side = 3, "Age 2", line = -1.5)
mtext(side = 1, "Mean 100m temp anomaly between 45-49ºN", line = 0.5, outer = T)
mtext(side = 2, "Proportion biomass in Canada", line = 1.25, outer = T)
lines(x =hake_x, y = dt_sub$mn, lty = 2)
polygon(x = c(hake_x, rev(hake_x)), y = c(dt_sub$up, rev(dt_sub$down)), col = adjustcolor("gray20", alpha.f = 0.2), border = NA)

# for(i in 1:3){
# text(x = hake_age2$temp_100_anom[which(hake_age2$temp_100_anom>0.25)][i], y = hake_age2$age2_CA_US[which(hake_age2$temp_100_anom>0.25)][i] + lab_adj, as.character(hake_age2$year[which(hake_age2$temp_100_anom>0.25)][i]), cex =0.8)
# }

# add all points
for(i in 1:length(hake_age2$temp_100_anom)){

if(hake_age2$year[i] %in% c(1995, 1998, 2003, 2005, 2007, 2009, 2011, 2012, 2015, 2017, 2019)){
  text(x = hake_age2$temp_100_anom[i], y = hake_age2$age2_CA_US[i] + 0.004, as.character(hake_age2$year[i]), cex =0.8)
} else{
  text(x = hake_age2$temp_100_anom[i], y = hake_age2$age2_CA_US[i] + 0.002, as.character(hake_age2$year[i]), cex =0.8)
}

}

dev.off()

```


# simulations

simulate data that resembles the age 5-20 data set

## parameter choices


```{r}

#View(hake_age5_20)

# first fit gam and lm on raw scale to get parameter values for the "true" driver-response relationship in the simulations
gam_raw_5_20 <- gam(age5_20_CA_US~s(temp_100_anom,k=4,bs="tp"),data = hake_age5_20)
lm_raw_5_20 <- lm(age5_20_CA_US~temp_100_anom,data = hake_age5_20)

driver_hake <- seq(from = min(hake_age5_20$temp_100_anom), to = max(hake_age5_20$temp_100_anom), length.out = 100) # driver values

gam_raw_5_20_fits <- gratia::fitted_values(gam_raw_5_20, data = data.frame(temp_100_anom = driver_hake))$fitted # GAM predictions

lm_raw_5_20_fits <- stats::predict(lm_raw_5_20, newdata = data.frame(temp_100_anom = driver_hake), se.fit = F) # linear model predictions

# get the threshold estimate on original scale
#View(age5_20_thresh$jack_summ)
thresh_est_5_20 <- invzfun(age5_20_thresh$jack_summ$thresh_mean[which(age5_20_thresh$jack_summ$thresh_method=="abs_max_d2" & age5_20_thresh$jack_summ$sig_type=="none")], hake_age5_20$temp_100_anom) # 0.2208785

# estimate the true driver-response relationship, assuming it is nonlinear and a hockeystick shape
# get the mean of the flat (pre-threshold) part of the GAM prediction
hs_1 <- mean(gam_raw_5_20_fits[which(driver_hake<=thresh_est_5_20)]) # 0.253041

# get the equation of the line (y = mx + b) for the second part (past the threshold)
hs_2m <- (gam_raw_5_20_fits[length(gam_raw_5_20_fits)]-hs_1)/(driver_hake[length(driver_hake)]-max(driver_hake[which(driver_hake<=thresh_est_5_20)])) # slope: 0.6409229
hs_2b <- gam_raw_5_20_fits[length(gam_raw_5_20_fits)]-hs_2m*driver_hake[length(driver_hake)] # intercept: 0.1165395

# check this looks right
plot(x = driver_hake, y = ifelse(driver_hake < thresh_est_5_20, hs_1, hs_2m*driver_hake + hs_2b), type = "l", ylim = c(0, 1))
lines(x = hake_age5_20$temp_100_anom, y = hake_age5_20$age5_20_CA_US, type = "p")

# can simulate data using this relationship using sim_emp_dat function
# "hs_type" 1) increasing slope then flat or 2) decreasing slope then flat
# so here use the flat then slope option
# if flip==TRUE, change x to (-x + 2*thresh_loc) to flip it around the threshold location

# get the parameters for the equation of the line: just use the slope and intercept from the linear model
#lm_raw_5_20 # intercept = 0.29, slope = 0.16
ln_b_5_20 <- unname(lm_raw_5_20$coefficients[1])# 0.2874051
ln_m_5_20 <- unname(lm_raw_5_20$coefficients[2])# 0.1636895

# check this looks right
plot(x = driver_hake, y = ln_m_5_20*driver_hake + ln_b_5_20, type = "l", ylim = c(0, 1))
lines(x = hake_age5_20$temp_100_anom, y = hake_age5_20$age5_20_CA_US, type = "p")

# get the noise in the y values: approximate based on standard deviation of the residuals of the GAM and linear model fits
sd(residuals(lm_raw_5_20)) # 0.152
sd(residuals(gam_raw_5_20)) # 0.134

obs_sd_5_20 <- 0.14 # choose 0.14

# get the characteristics of the driver data
#hist(hake_age5_20$temp_100_anom, n = 8)

# assume driver has normal distribution
x_sd1 <- sd(hake_age5_20$temp_100_anom) # 0.3221409
x_mean1 <- mean(hake_age5_20$temp_100_anom) # 0

# temporal autocorrelation
#acf(hake_age5_20$temp_100_anom) # doesn't seem to be an issue

# threshold quantile
# get the quantile of the threshold value: need to find which value of the threshold quantile would correspond to a value of the mean of the driver's distribution that is approximately equal to the observed mean (assuming driver has a normal distribution)
quant_vals <- seq(from = 0.001, to = 0.999, by = 0.001) # quantile values to check
quant_test <- rep(NA, length(quant_vals)) # holding vector for corresponding driver values

for(i in 1: length(quant_test)){

  quant_test[i] <- qnorm(quant_vals[i], mean = 0, sd = x_sd1) # driver value corresponding to the ith quantile of a normal distribution with a mean of 0 and as sd equal to the observed sd value
}

# figure out which of the x values in quant_test is closest to the true threshold quantile value
# for true threshold quantile, x_mean = thresh_true - qnorm(thresh_quant), which means x_mean - (thresh_true - qnorm(thresh_quant)) = 0, so want to find the value of quant_test that minimizes this expression
which(abs(x_mean1-(thresh_est_5_20-quant_test))==min(abs(x_mean1-(thresh_est_5_20-quant_test)))) # 754

quant_vals[754] # 0.754

thresh_quant1 <- 0.754 # threshold quantile is approximately 0.754 (i.e., about 75% of observations fall below the threshold value)

# check want thresh_loc - qnorm(thresh_quant, mean = 0, sd = x_sd1) is close to the observed mean
thresh_est_5_20 - qnorm(thresh_quant1, mean = 0, sd = x_sd1)
x_mean1

```


put all the parameters together and check that the resulting simulated data sets look reasonable
```{r}

# driver parameters
driver_pars_5_20 = list(x_min = NULL, x_max = NULL, thresh_quant = thresh_quant1, x_df = 10, x_sd = x_sd1, x_dist = "normal")


# parameters for driver-response relationship
control_pars_5_20 <-  list(thresh_loc = thresh_est_5_20, thresh_loc_sd = 0, y_max = gam_raw_5_20_fits[length(gam_raw_5_20_fits)], y_min = hs_1, min_x =-driver_hake[length(driver_hake)]+2*thresh_est_5_20, hs_type = 2, hs_a = NULL, flip_y = TRUE, skew_cv = 1, skew_conc = "up", sig_k = 1.5, lin_m = ln_m_5_20, lin_b = ln_b_5_20)

# simulate a single test data set
sim_dt_5_20 <- sim_emp_data(nsim = 1, tmax = 13, fun = "hockeystick", control_pars = control_pars_5_20, driver_pars = driver_pars_5_20, obs_sd = obs_sd_5_20)

# compare to actual data set
#mean(hake_age5_20$temp_100_anom)
#sd(hake_age5_20$temp_100_anom)
#mean(sim_dt_5_20$driver)
#sd(sim_dt_5_20$driver)

#hist(sim_dt_5_20$driver)
#abline(v = thresh_est_5_20 - qnorm(driver_pars_5_20$thresh_quant, mean = 0, sd = x_sd1))

# see what the simulated data look like
plot(x = driver_hake, y = gam_raw_5_20_fits, type = "l", xlab = "driver", ylab = "response", xlim = c(min(sim_dt_5_20$driver), max(sim_dt_5_20$driver)), ylim = c(min(sim_dt_5_20$obs_response), max(sim_dt_5_20$obs_response)))
lines(x = driver_hake, y = lm_raw_5_20_fits, lty = 2)
abline(v = thresh_est_5_20)
#abline(h = hs_1, col = "blue")
#abline(a = hs_2b, b = hs_2m, col = "blue")
lines(x = sim_dt_5_20$driver[order(sim_dt_5_20$driver)], y = sim_dt_5_20$response[order(sim_dt_5_20$driver)], type = "l", col = "blue")
lines(x = sim_dt_5_20$driver, y = sim_dt_5_20$obs_response, type = "p", pch = 16)

#plot(x = sim_dt_5_20$driver, y = sim_dt_5_20$obs_response, type = "p")
#abline(v = thresh_est_5_20)

#length(which(sim_dt_5_20$driver>thresh_est_5_20))/length(sim_dt_5_20$driver)


# look at means more closely to make sure the threshold quantile choice is working
# simulate 100 data sets and get the mean value of the driver in each for each of them
sim_dt_test <- sim_emp_data(nsim = 100, tmax = 20, fun = "hockeystick", control_pars = control_pars_5_20, driver_pars = driver_pars_5_20, obs_sd = obs_sd_5_20)

sim_dt_test <- sim_dt_test %>% group_by(sim) %>% summarize(mean_driver = mean(driver))

hist(sim_dt_test$mean_driver)
abline(v = thresh_est_5_20 - qnorm(driver_pars_5_20$thresh_quant, mean = 0, sd = x_sd1))

# simulate a single data set with a time series length of 1000
sim_dt_test <- sim_emp_data(nsim = 1, tmax = 1000, fun = "hockeystick", control_pars = control_pars_5_20, driver_pars = driver_pars_5_20, obs_sd = obs_sd_5_20)

hist(sim_dt_test$driver)

```


```{r}
# check how often there are values less than 0 or greater than 1 (since the true response is a proportion and should therefore be between 0 and 1)
sim_dt_test2 <- sim_emp_data(nsim = 100, tmax = 50, fun = "hockeystick", control_pars = control_pars_5_20, driver_pars = driver_pars_5_20, obs_sd = obs_sd_5_20)

length(which(sim_dt_test2$obs_response > 1))/length(sim_dt_test2$sim)*100
length(which(sim_dt_test2$obs_response < 0))/length(sim_dt_test2$sim)*100

sim_dt_test3 <- sim_emp_data(nsim = 100, tmax = 50, fun = "linear", control_pars = control_pars_5_20, driver_pars = driver_pars_5_20, obs_sd = obs_sd_5_20)

length(which(sim_dt_test3$obs_response > 1))/length(sim_dt_test2$sim)*100
length(which(sim_dt_test3$obs_response < 0))/length(sim_dt_test2$sim)*100

```

## run simulations

Look at the effect of time series length and observation error on threshold detectability. Simulations take about 85 min for 100 replicates

```{r}
# parmat_h
nsim1 <- 100 # 100 replicate data sets

# driver values for generating predictions (on standardized scale)
xvals1 <- seq(from = -3, to = 3, by = 0.01)


obs_set_h <- c(obs_sd_5_20, 0.1*obs_sd_5_20)# sd's of observation error
ts_set_h <- seq(from = 13, to = 41, by = 4) # time series lengths
quant_set_h <- c(thresh_quant1)
cov_sd_set_h <- c(0) # no covariate effects

# get all the parameter combinations together
parmat_h1 <- unname(as.matrix(expand.grid(obs_set_h, ts_set_h, quant_set_h[1], cov_sd_set_h[1])))

parmat_h <- unique(parmat_h1) # make sure only unique combinations are used
#parmat_h

```


### hockeystick

simulations where true driver-response relationship is a hockeystick shape based on the GAM fit to the empirical age 5-20 data set

```{r}
# hockeystick simulations

# first simulate the data and save as a list
dt_list <- vector(mode = "list")

tictoc::tic()
for(p in 1:length(parmat_h[,1])){

  driver_pars_p = list(x_min = NULL, x_max = NULL, thresh_quant = parmat_h[p, 3], x_df = 10, x_sd = x_sd1, x_dist = "normal")

  cov_pars_p <- list(inc_cov = TRUE, beta_mean = 1, beta_sd = 0, beta_sign = 1, cov_mean = 0, cov_sd = parmat_h[p, 4], cov_int = NA)

  dt_p <- sim_emp_data(nsim = nsim1, tmax = parmat_h[p, 2], driver_pars = driver_pars_p, obs_sd = parmat_h[p, 1], fun = "hockeystick", control_pars = control_pars_5_20, cov_pars = cov_pars_p) # CHANGE fun and control_pars for each driver-response relationship

  # un-comment the two lines below to restrict simulated response variables to be between 0 and 1
  #dt_p$obs_response <- ifelse(dt_p$obs_response > 1, 1, ifelse(dt_p$obs_response < 0, 0, dt_p$obs_response))
  #dt_p$response <- ifelse(dt_p$response > 1, 1, ifelse(dt_p$response < 0, 0, dt_p$response))

  dt_list[[p]] <- dt_p

}
tictoc::toc()

tictoc::tic()
for(p in 1:length(parmat_h[,1])){
  # get the simulated data
  dt_p <- dt_list[[p]]

  # do the linearity test
  ln_p <- lin_check(dt_p, thresh_methods = c("abs_max_d2"))

  # add the parameter columns
  ln_p$obs_error <- rep(parmat_h[p, 1], length(ln_p$sim))
  ln_p$ts_length <- rep(parmat_h[p, 2], length(ln_p$sim))
  ln_p$thresh_quant <- rep(parmat_h[p, 3], length(ln_p$sim))
  ln_p$cov_sd <- rep(parmat_h[p, 4], length(ln_p$sim))

  # save the data frame
  if(p == 1){
    lndf <- ln_p
  } else {
    lndf <- rbind(ln_p, lndf)
  }


}
tictoc::toc()

# do the jackknifing
tictoc::tic()
for(p in 1:length(parmat_h[,1])){
  # get the simulated data
  dt_p <- dt_list[[p]]

  # do the jackknifing
  thresh_pj <- jack_thresh(dt_p, xvals1, thresh_methods = c("abs_max_d2"))

  # add the parameter columns
  thresh_pj$obs_error <- rep(parmat_h[p, 1], length(thresh_pj$sim))
  thresh_pj$ts_length <- rep(parmat_h[p, 2], length(thresh_pj$sim))
  thresh_pj$thresh_quant <- rep(parmat_h[p, 3], length(thresh_pj$sim))
  thresh_pj$cov_sd <- rep(parmat_h[p, 4], length(thresh_pj$sim))

  # save the data frame
  if(p == 1){
    jdf <- thresh_pj
  } else {
    jdf <- rbind(thresh_pj, jdf)
  }


}


tictoc::toc()

# previously saved jdf
#df_tmp <- read.csv("simulation outputs0/hake_sim2.csv")

#jdf <- df_tmp %>% filter(shape == "hockeystick") %>% select(-X, -shape, -best_mod, -aic_diff)

hs_jdf <- jdf
hs_jdf$shape <- rep("hockeystick", length(hs_jdf$sim))
hs_jdf$best_mod <- lndf$best_mod
hs_jdf$aic_diff <- lndf$aic_diff

#View(hs_jdf)

```


### linear

simulations where true driver-response relationship is a linear function based on the linear regression fit to the empirical age 5-20 data set

```{r}

# linear simulations

# first simulate the data and save as a list
dt_list <- vector(mode = "list")

tictoc::tic()
for(p in 1:length(parmat_h[,1])){

  driver_pars_p = list(x_min = NULL, x_max = NULL, thresh_quant = parmat_h[p, 3], x_df = 10, x_sd = x_sd1, x_dist = "normal")

  cov_pars_p <- list(inc_cov = TRUE, beta_mean = 1, beta_sd = 0, beta_sign = 1, cov_mean = 0, cov_sd = parmat_h[p, 4], cov_int = NA)

  dt_p <- sim_emp_data(nsim = nsim1, tmax = parmat_h[p, 2], driver_pars = driver_pars_p, obs_sd = parmat_h[p, 1], fun = "linear", control_pars = control_pars_5_20, cov_pars = cov_pars_p) # CHANGE fun and control_pars for each driver-response relationship

  # un-comment the two lines below to restrict simulated response variables to be between 0 and 1
  #dt_p$obs_response <- ifelse(dt_p$obs_response > 1, 1, ifelse(dt_p$obs_response < 0, 0, dt_p$obs_response))
  #dt_p$response <- ifelse(dt_p$response > 1, 1, ifelse(dt_p$response < 0, 0, dt_p$response))

  dt_list[[p]] <- dt_p

}
tictoc::toc()

tictoc::tic()
for(p in 1:length(parmat_h[,1])){
  # get the simulated data
  dt_p <- dt_list[[p]]

  # do the linearity test
  ln_p <- lin_check(dt_p, thresh_methods = c("abs_max_d2"))

  # add the parameter columns
  ln_p$obs_error <- rep(parmat_h[p, 1], length(ln_p$sim))
  ln_p$ts_length <- rep(parmat_h[p, 2], length(ln_p$sim))
  ln_p$thresh_quant <- rep(parmat_h[p, 3], length(ln_p$sim))
  ln_p$cov_sd <- rep(parmat_h[p, 4], length(ln_p$sim))

  # save the data frame
  if(p == 1){
    lndf <- ln_p
  } else {
    lndf <- rbind(ln_p, lndf)
  }


}
tictoc::toc()

# do the jackknifing
tictoc::tic()
for(p in 1:length(parmat_h[,1])){
  # get the simulated data
  dt_p <- dt_list[[p]]

  # do the jackknifing
  thresh_pj <- jack_thresh(dt_p, xvals1, thresh_methods = c("abs_max_d2"))

  # add the parameter columns
  thresh_pj$obs_error <- rep(parmat_h[p, 1], length(thresh_pj$sim))
  thresh_pj$ts_length <- rep(parmat_h[p, 2], length(thresh_pj$sim))
  thresh_pj$thresh_quant <- rep(parmat_h[p, 3], length(thresh_pj$sim))
  thresh_pj$cov_sd <- rep(parmat_h[p, 4], length(thresh_pj$sim))

  # save the data frame
  if(p == 1){
    jdf <- thresh_pj
  } else {
    jdf <- rbind(thresh_pj, jdf)
  }


}


tictoc::toc()

# previously saved jdf
#df_tmp <- read.csv("simulation outputs0/hake_sim2.csv")

#jdf <- df_tmp %>% filter(shape == "linear") %>% select(-X, -shape, -best_mod, -aic_diff)

ln_jdf <- jdf
ln_jdf$shape <- rep("linear", length(ln_jdf$sim))
ln_jdf$best_mod <- lndf$best_mod
ln_jdf$aic_diff <- lndf$aic_diff

```


### save results

```{r}
hake_sim_df <- rbind(hs_jdf, ln_jdf)

write.csv(hake_sim_df, "simulation outputs/hake_sim2.csv")

#View(hake_sim_df)

```


## plot results

### hake fits vs. sims

plot the GAM and linear regression fits to the age 5-20 hake data and overlay the approximate linear and nonlinear "true" driver-response relationships used in the above simulations

```{r}

pdf("figurepdfs/hake_sim_relationships.pdf", height = 4)
par(mfrow = c(1, 2))
par(mar=c(2, 0.5, 1, 0.5), oma = c(2, 3, 1, 0.5))
plot(x = hake_age5_20$temp_100_anom, y = hake_age5_20$age5_20_CA_US, xlab = NA, ylab = NA, las = 1, pch = 16)
lines(x = driver_hake, y = ifelse(driver_hake < thresh_est_5_20, hs_1, hs_2m*driver_hake + hs_2b), type = "l", lty = 2, lwd = 2)
lines(x = driver_hake, y = gam_raw_5_20_fits, type = "l")
#abline(v = thresh_est_5_20, col = "blue")
mtext(side = 1, "Mean 100m temp anomaly between 45-49ºN", line = 2.5, adj = -3.15)
mtext(side = 2, "Prop. age 5-20 hake biomass in CA", line = 2.5)
mtext(side = 3, "Nonlinear relationship")
legend(x = -0.5, y= 0.62, c("statistical fit", "simulated \nrelationship"), lwd = c(1, 2), lty = c(1, 2), bty = "n", cex = 0.8)

plot(x = hake_age5_20$temp_100_anom, y = hake_age5_20$age5_20_CA_US, xlab = NA, ylab = NA, las = 1, yaxt = "n", pch = 16)
axis(side = 2, at = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6), labels = NA)
lines(x = driver_hake, y = ln_m_5_20*driver_hake + ln_b_5_20, type = "l", lty = 2, lwd = 2)
lines(x = driver_hake, y = lm_raw_5_20_fits, type = "l")
#mtext(side = 1, "Mean 100m temp anomaly between 45-49ºN", line = 2.5)
mtext(side = 3, "Linear relationship")

dev.off()


```

### ROC, 100 reps

make reciever-operator curve plot using all 100 replicate data sets for each parameter combination


prep the data to plot:
```{r}
# ROC plots

# strict detection criteria

min_frac <- 0.95 # min fraction of jackknife iterations that need to have detected a threshold for a positive detection

# sig_type = sim_int (simultaneous interval significance criteria)
data_sub <- hake_sim_df %>% filter(sig_type=="sim_int")%>% mutate(thresh_n = if_else(best_mod %in% c("lm", "lm_ns", "gam_ns"), 0, thresh_n))%>% mutate(thresh_fraction = thresh_n/ts_length) %>% mutate(thresh_fraction = if_else(thresh_n_full %in% c(NA, 1), thresh_fraction, 0))

#View(data_sub)

# false positives
fpr_df <- data_sub %>% filter(shape=="linear") %>% mutate(n_sims = if_else(thresh_fraction >= min_frac, 1, 0)) %>% group_by(thresh_method, obs_error, ts_length, thresh_quant, cov_sd) %>% summarize(FPR = sum(n_sims)/100) #/100 because 100 replicate simulations

# true positives
tpr_df <- data_sub %>% filter(shape !="linear") %>% mutate(n_sims = if_else(thresh_fraction >= min_frac, 1, 0)) %>% group_by(shape, thresh_method, obs_error, ts_length, thresh_quant, cov_sd) %>% summarize(TPR = sum(n_sims)/100)

#View(tpr_df)

# join together
roc_df_strict <- left_join(tpr_df, fpr_df, by = c("thresh_method", "obs_error", "ts_length", "thresh_quant", "cov_sd")) %>% mutate(sig_criteria = "strict")

#View(roc_df_strict)


# repeat for weak detection criteria
min_frac <- 0.25# min fraction of jackknife iterations that need to have detected a threshold for a positive detection

# sig_type = "none" (no significance criteria)
data_sub <- hake_sim_df %>% filter(sig_type=="none")%>% mutate(thresh_n = if_else(best_mod %in% c("lm", "lm_ns", "gam_ns"), 0, thresh_n))%>% mutate(thresh_fraction = thresh_n/ts_length) %>% mutate(thresh_fraction = if_else(thresh_n_full %in% c(NA, 1), thresh_fraction, 0))

#View(data_sub)

fpr_df <- data_sub %>% filter(shape=="linear") %>% mutate(n_sims = if_else(thresh_fraction >= min_frac, 1, 0)) %>% group_by(thresh_method, obs_error, ts_length, thresh_quant, cov_sd) %>% summarize(FPR = sum(n_sims)/100)

#View(fpr_df)

tpr_df <- data_sub %>% filter(shape !="linear") %>% mutate(n_sims = if_else(thresh_fraction >= min_frac, 1, 0)) %>% group_by(shape, thresh_method, obs_error, ts_length, thresh_quant, cov_sd) %>% summarize(TPR = sum(n_sims)/100)

#View(tpr_df)

roc_df_weak <- left_join(tpr_df, fpr_df, by = c("thresh_method", "obs_error", "ts_length", "thresh_quant", "cov_sd")) %>% mutate(sig_criteria = "weak")

#View(roc_df_weak)

# join the weak and strict detection cases together
roc_df <- rbind(roc_df_strict, roc_df_weak)

#View(roc_df)

#head(data_sub)

```


make the plot:
```{r}

pt_cols <- c("blue", "cadetblue") # colors for low vs. high obs error
pt_pch <- c(0, 1, 2, 5, 6, 15, 16, 17) # point types from shortest to longest ts


pdf("figurepdfs/hake_sim_ROC.pdf", width = 7, height = 3.5)
layout(matrix(c(1,2), 1, 2))
par(mar=c(0, 1, 0.5, 0), oma = c(3, 3, 1, 0.5))

plot_df <- roc_df[which(roc_df$sig_criteria=="weak"),]
dt_sub <- plot_df %>% arrange(obs_error, ts_length)

plot(x = 0, y = 0, ylim = c(0, 1), xlim = c(0, 1), type = "n", las = 1, xlab = NA, ylab = NA, xaxt = "n", yaxt = "n")
axis(side = 1, at = seq(from = 0, to = 1, by = 0.2))
axis(side = 2, at = seq(from = 0, to = 1, by = 0.2), las = 1)
mtext(side = 3, "Weak detection requirements")
mtext(side = 2, "True positive rate", line = 2.5)
mtext(side = 1, "False positive rate", line = 2)
#mtext(side = 3, "Linear covariate", line = 0)
abline(a = 0, b = 1, lty = 2)
for(i in 1:length(obs_set_h)){

for(j in 1:length(ts_set_h)){

 # print(paste(pt_cols[i], pt_pch[j], sep = ","))
points(x = dt_sub$FPR[which(as.character(dt_sub$obs_error)==as.character(obs_set_h[i]) & dt_sub$ts_length==ts_set_h[j])], y = dt_sub$TPR[which(as.character(dt_sub$obs_error)==as.character(obs_set_h[i]) & dt_sub$ts_length==ts_set_h[j])], pch = pt_pch[j], col = pt_cols[i]) #adjustcolor(pt_cols[i], alpha.f = pt_alpha[j])
}
}
# put circle around current conditions
points(x = dt_sub$FPR[which(dt_sub$obs_error==0.14 & dt_sub$ts_length==13)], y = dt_sub$TPR[which(dt_sub$obs_error==0.14 & dt_sub$ts_length==13)], pch = 1, col = "black", cex = 2.4)

# legend background
legend("bottomright", legend = as.character(ts_set_h), col = "gray30", pch = pt_pch, title = "Time series \nlength", bg = "white", ncol = 2, bty = "n")
#legend("bottomright", legend = as.character(ts_set_h), col = "gray30", pch = pt_pch, title = "ts length", bg = "white", ncol = 2, bty = "n")
#legend("bottom", legend = c("0.014", "0.14"), col = c("cadetblue", "blue"), title = "obs_sd", bg = "white", pch = 16, bty = "n")
legend("bottom", legend = c("0.014", "0.14"), col = c("cadetblue", "blue"), title = "Obs. \nerror", bg = "white", pch = 16, bty = "n")


plot_df <- roc_df[which(roc_df$sig_criteria=="strict"),]
dt_sub <- plot_df %>% arrange(obs_error, ts_length)

plot(x = 0, y = 0, ylim = c(0, 1), xlim = c(0, 1), type = "n", las = 1, xlab = NA, ylab = NA, xaxt = "n", yaxt = "n")
axis(side = 1, at = seq(from = 0, to = 1, by = 0.2))
axis(side = 2, at = seq(from = 0, to = 1, by = 0.2), labels = NA)
mtext(side = 3, "Strict detection requirements")
#mtext(side = 2, "True positive rate", line = 2.5)
mtext(side = 1, "False positive rate", line = 2)
#mtext(side = 3, "Linear covariate", line = 0)
abline(a = 0, b = 1, lty = 2)
for(i in 1:length(obs_set_h)){

for(j in 1:length(ts_set_h)){

 # print(paste(pt_cols[i], pt_pch[j], sep = ","))
points(x = dt_sub$FPR[which(as.character(dt_sub$obs_error)==as.character(obs_set_h[i]) & dt_sub$ts_length==ts_set_h[j])], y = dt_sub$TPR[which(as.character(dt_sub$obs_error)==as.character(obs_set_h[i]) & dt_sub$ts_length==ts_set_h[j])], pch = pt_pch[j], col = pt_cols[i]) #adjustcolor(pt_cols[i], alpha.f = pt_alpha[j])
}
}

# put circle around current conditions
points(x = dt_sub$FPR[which(dt_sub$obs_error==0.14 & dt_sub$ts_length==13)], y = dt_sub$TPR[which(dt_sub$obs_error==0.14 & dt_sub$ts_length==13)], pch = 1, col = "black", cex = 2.4)

dev.off()


```

### ROC, 50 reps

repeat the above but only using the first 50 simulation replicates (to see whether patterns changed a lot between 50 and 100 reps, which would suggest 100 replicates might not be sufficient)

```{r}

# strict detection criteria

min_frac <- 0.95

# with >2 requirement
data_sub <- hake_sim_df %>% filter(sim <=50) %>% filter(sig_type=="sim_int")%>% mutate(thresh_n = if_else(best_mod %in% c("lm", "lm_ns", "gam_ns"), 0, thresh_n))%>% mutate(thresh_fraction = thresh_n/ts_length) %>% mutate(thresh_fraction = if_else(thresh_n_full %in% c(NA, 1), thresh_fraction, 0))

#View(data_sub)

fpr_df <- data_sub %>% filter(shape=="linear") %>% mutate(n_sims = if_else(thresh_fraction >= min_frac, 1, 0)) %>% group_by(thresh_method, obs_error, ts_length, thresh_quant, cov_sd) %>% summarize(FPR = sum(n_sims)/50)

#View(fpr_df)

tpr_df <- data_sub %>% filter(shape !="linear") %>% mutate(n_sims = if_else(thresh_fraction >= min_frac, 1, 0)) %>% group_by(shape, thresh_method, obs_error, ts_length, thresh_quant, cov_sd) %>% summarize(TPR = sum(n_sims)/50)

#View(tpr_df)

roc_df_strict <- left_join(tpr_df, fpr_df, by = c("thresh_method", "obs_error", "ts_length", "thresh_quant", "cov_sd")) %>% mutate(sig_criteria = "strict")

#View(roc_df_strict)


# weak detection criteria
min_frac <- 0.25
data_sub <- hake_sim_df %>% filter(sim <=50) %>% filter(sig_type=="none")%>% mutate(thresh_n = if_else(best_mod %in% c("lm", "lm_ns", "gam_ns"), 0, thresh_n))%>% mutate(thresh_fraction = thresh_n/ts_length) %>% mutate(thresh_fraction = if_else(thresh_n_full %in% c(NA, 1), thresh_fraction, 0))