Global process

Prepare data

Get W data about wood occurrences

wpath="../data-raw/mem_data/woody_data_train/"
mystation="V2942010"
fs::dir_tree(wpath)
## ../data-raw/mem_data/woody_data_train/
## ├── event_1
## │   ├── Wood Log 20071123 09h56m.txt
## │   ├── Wood Log 20071123 10h11m.txt
## │   ├── Wood Log 20071123 10h56m.txt
## │   ├── Wood Log 20071123 11h56m.txt
## │   └── Wood Log 20071123 12h56m.txt
## └── event_2
##     ├── Wood Log 20071123 13h56m.txt
##     ├── Wood Log 20071123 14h26m.txt
##     ├── Wood Log 20071123 14h56m.txt
##     ├── Wood Log 20071123 15h56m.txt
##     └── Wood Log 20071123 16h50m.txt
library(woody)

Import all wood log files inside a file (1 directory per event)

W=import_Wdata(wpath,site="training") 

Get Q data about discharge

Import Q:

qpath="../data-raw/Qdata/Qdatc_Ain.csv"
Q=import_Qdata(path=qpath, site="Ain")
##  Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 86528 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr  (2): site, station
## dbl  (4): Q, T_Q, S, rT_Q
## dttm (1): Time
## 
##  Use `spec()` to retrieve the full column specification for this data.
##  Specify the column types or set `show_col_types = FALSE` to quiet this message.

The first lines of Q look like this

station Time Q site
V2942010 2001-01-01 00:00:00 0.1345238 Ain
V2942010 2001-01-01 03:00:00 0.1369048 Ain
V2942010 2001-01-01 04:48:00 0.1333333 Ain
V2942010 2001-01-01 05:20:00 0.1273810 Ain
V2942010 2001-01-01 06:06:00 0.1188095 Ain
V2942010 2001-01-01 07:18:00 0.1095238 Ain

Complete Q into Qc

Now complete Q with variables T_Q and S

result_file="../data-raw/results/Q.RDS"
if(!file.exists(result_file)){
  Qc=complete_Qdata(qtvar=Q,site="Ain")
  saveRDS(Qc,result_file)
}
Qc=readRDS(result_file)

The first lines of Qc look like this:

site station Time Q T_Q S rT_Q
Ain V2942010 2001-01-01 00:00:00 0.1345238 0.000 NA 0.0000000
Ain V2942010 2001-01-01 03:00:00 0.1369048 0.125 0.0000661 0.3535534
Ain V2942010 2001-01-01 04:48:00 0.1333333 0.000 -0.0001653 0.0000000
Ain V2942010 2001-01-01 05:20:00 0.1273810 0.000 -0.0009301 0.0000000
Ain V2942010 2001-01-01 06:06:00 0.1188095 0.000 -0.0009317 0.0000000
Ain V2942010 2001-01-01 07:18:00 0.1095238 0.000 -0.0006448 0.0000000

Complete W with Q and get Wc

Complete W with discharge data (interpolating Q)

Wc=complete_Wdata_with_Qdata(Wdata=W,
                             Qdata=Qc)

The first lines of Wc look like this:

site event sitevent Time Length Date Q T_Q S rT_Q
training event_1 training_event_1 2007-11-23 08:57:01 1.41 2007-11-23 0.4970903 135.0343 0.0025741 11.62042
training event_1 training_event_1 2007-11-23 08:57:06 0.86 2007-11-23 0.4971250 135.0347 0.0025726 11.62044
training event_1 training_event_1 2007-11-23 08:57:14 0.55 2007-11-23 0.4971806 135.0354 0.0025702 11.62047
training event_1 training_event_1 2007-11-23 08:57:15 0.58 2007-11-23 0.4971875 135.0355 0.0025699 11.62048
training event_1 training_event_1 2007-11-23 08:57:25 2.47 2007-11-23 0.4972569 135.0364 0.0025668 11.62051
training event_1 training_event_1 2007-11-23 08:57:30 3.00 2007-11-23 0.4972917 135.0368 0.0025653 11.62053

Transform Wc into Wwt

Then calculate Wwt, where 1 row= 1 waiting time between two wood occurrences.

The first lines of Wwt look like this:

site event sitevent Time Length Date Q T_Q S rT_Q TimeLagged W npieces num Y
training event_1 training_event_1 2007-11-23 08:57:06 0.86 2007-11-23 0.4971250 135.0347 0.0025726 11.62044 2007-11-23 08:57:01 5 1 1 6.579251
training event_1 training_event_1 2007-11-23 08:57:14 0.55 2007-11-23 0.4971806 135.0354 0.0025702 11.62047 2007-11-23 08:57:06 8 1 1 6.109248
training event_1 training_event_1 2007-11-23 08:57:15 0.58 2007-11-23 0.4971875 135.0355 0.0025699 11.62048 2007-11-23 08:57:14 1 1 1 8.188689
training event_1 training_event_1 2007-11-23 08:57:25 2.47 2007-11-23 0.4972569 135.0364 0.0025668 11.62051 2007-11-23 08:57:15 10 1 1 5.886104
training event_1 training_event_1 2007-11-23 08:57:30 3.00 2007-11-23 0.4972917 135.0368 0.0025653 11.62053 2007-11-23 08:57:25 5 1 1 6.579251
training event_1 training_event_1 2007-11-23 08:57:32 0.82 2007-11-23 0.4973056 135.0370 0.0025647 11.62054 2007-11-23 08:57:30 2 1 1 7.495542

Run random forest

myrf <- run_rf(Wwt)
myrf
## 
## Call:
##  randomForest(formula = Y ~ ., data = Wdata_rf, ntree = 1000) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 1.265999
##                     % Var explained: -15.74

Get predictions and assess random forest performance

On the training dataset itself

Predict on training data itself

Wp=predict_rf(obj_rf=myrf,
              newdata=Wwt)

R2:

## # A tibble: 1 × 3
##     SCR   SCT    R2
##   <dbl> <dbl> <dbl>
## 1  798. 2273. 0.649

On another dataset

wpathpred="../data-raw/mem_data/woody_data_predict/"

Wwt_test=import_Wdata(wpathpred) %>%
  complete_Wdata_with_Qdata(Qc) %>% 
  Wdata_as_waiting_times() 

Wp_test=predict_rf(Wwt_test,myrf)
calc_rf_R2(Wp_test)
## # A tibble: 1 × 3
##     SCR   SCT    R2
##   <dbl> <dbl> <dbl>
## 1  500.  154. -2.24