library(nlme) library(mgcv) library(MuMIn) library(multcomp) #library(car) #source("~/Documents/2015pCO2paper/UniversalRCode/multcompToGraph.R") #source("~/Documents/2015pCO2paper/UniversalRCode/nlmeDiagPlots.R") #source("~/Documents/2015pCO2paper/UniversalRCode/qqnormWithBounds.R") # Reads in Japan data and reformats as needed ## the necessary data order is ##'Expocode,longitude,latitude,yr,mon,day,hh,mm,sst,sss,fCO2rec,PPPP,Requ,Tequ,ETOPO2_depth' read.in.data <- function(the.string) { raw.data <- read.csv(the.string) raw.data <- raw.data[,1:11] the.data <- na.omit(raw.data) #the.data <- the.data[the.data$fCO2rec<500,] # remove outliers per Hongjie the.data <- the.data[the.data$sss>30,] # remove outliers per Hongjie colnames(the.data)[1] <- "cruise" # c.date is character date # a.date is in Date format (days after Jan 1, 1970) # c.time is character time (24 hours) # p.date is Posix format time, seconds after midnight, Jan 1, 1970, zulu time # not sure if original data in zulu time or the.data time # n.date is p.date converted to numeric format # m.date is numeric minutes after midnight, Jan 1, 1970, zulu time # doy is day of the year, with doy=1 on Jan 1 the.data$c.date <- paste(the.data$yr, the.data$mon, the.data$day, sep="-") the.data$a.date <- as.Date(the.data$c.date,format="%Y-%m-%d") the.data$c.time <- paste(the.data$hh, the.data$mm, sep=":") the.data$p.date <- as.POSIXct(paste(the.data$c.date, the.data$c.time),format="%Y-%m-%d %H:%M") the.data$n.date <- as.numeric(the.data$p.date) the.data$m.date <- round(the.data$n.date/60) # Hah! 8 times are regarded as invalid because they occur during the shift forward # in daylight savings time. For now, we'll remove them: the.data <- na.omit(the.data) the.data$doy <- 0 for (a.yr in unique(the.data$yr)) { jan.1 <- as.Date(paste(a.yr,1,1,sep="-"), format="%Y-%m-%d") the.data$doy[the.data$yr==a.yr] <- the.data$a.date[the.data$yr==a.yr] - jan.1 + 1 } the.data <- the.data[order(the.data$cruise, the.data$n.date),] the.data$f.mon <- factor(the.data$mon) the.data$yr.mon <- round(the.data$yr + the.data$mon/12 - 1/24,2) the.data$f.yr.mon <- factor(the.data$yr.mon) # try subdividing cruises if gap > 180 minutes c.cruise <- as.character(the.data$cruise) for (u in unique(c.cruise)) { inc.ctr <- 1 cruise.names <- rep(u, sum(c.cruise==u)) dt <- diff(the.data$m.date[c.cruise==u]) big.gaps <- (1:length(dt))[dt>180] if (length(big.gaps) > 0) { for (k in 1:length(big.gaps)) { inc.ctr <- inc.ctr+1 cruise.names[(big.gaps[k]+1):length(cruise.names)] <- paste(u,inc.ctr,sep="_") } } c.cruise[c.cruise==u] <- cruise.names } #try subdividing cruises into subcruises of length 25 or less for (u in unique(c.cruise)) { inc.ctr <- 1 l <- sum(c.cruise==u) cruise.names <- rep(u, l) cruise.ctr <- 1 while (l > 49) { inc.ctr <- inc.ctr+1 cruise.names[cruise.ctr:length(cruise.names)] <- paste(u, inc.ctr, sep="s") cruise.ctr <- cruise.ctr + 25; l <- l-25 } if (l > 25) { inc.ctr <- inc.ctr+1 cruise.names[cruise.ctr:length(cruise.names)] <- paste(u, inc.ctr, sep="s") cruise.ctr <- cruise.ctr + round(l/2) inc.ctr <- inc.ctr+1 cruise.names[cruise.ctr:length(cruise.names)] <- paste(u, inc.ctr, sep="s") } c.cruise[c.cruise==u] <- cruise.names } the.data$cruise <- factor(c.cruise) the.data <- the.data[order(the.data$cruise, the.data$n.date),] # some datasets have multiple measurements in the same minute; average them unique.id <- factor(paste(the.data$cruise, the.data$m.date)) the.data <- gsummary(the.data, groups=unique.id) return(the.data) } opt.fn <- function(params, y, dt) { phi <- exp(params[1]); sigma <- exp(params[2]); n <- length(y) diffs <- y[2:n]-(phi^dt)*y[1:(n-1)] # print(diffs[1:10]) # print(c(phi, sigma)) return(-sum(dnorm(diffs, 0, sigma, log=T))) } fit.one.cruise <- function(the.data, the.cruise) { y <- the.data$fCO2rec[the.data$cruise==the.cruise] t <- the.data$m.date[the.data$cruise==the.cruise] dt <- diff(t) bang <- optim(c(-0.01, 0), opt.fn, y=y-mean(y), dt=dt, lower=c(-Inf,-Inf), upper=c(-1e-6,6), method="L-BFGS-B") return(exp(bang$par)) } examine.cruises <- function (the.data, t0.model) { n.cruises <- length(levels(the.data$cruise)) cruise.summary <- matrix(0, n.cruises, 11) colnames(cruise.summary) <- c("mean.deviation.from.T0", "sd.dev", "n.obs", "total.time", "corExp.range", "cor.p.value", "dt", "rho", "exp.rho", "fit.rho", "fit.sigma") rownames(cruise.summary) <- levels(the.data$cruise) big.predict <- predict(t0.model$model, newdata=data.frame(times=the.data$yr.mon)) big.predict <- big.predict - mean(the.data$fCO2rec) big.predict <- big.predict + t0.model$monthly.df$decycled.fco2[the.data$mon] for (i in 1:n.cruises) { cr <- levels(the.data$cruise)[i] #print(cr) x <- the.data$fCO2rec[the.data$cruise==cr] n <- length(x) delta.t <- diff(the.data$m.date[the.data$cruise==cr]) devs <- x - big.predict[the.data$cruise==cr] cruise.summary[i,1] <- mean(devs) cruise.summary[i,3] <- sum(the.data$cruise==cr) cruise.summary[i,4] <- diff(range(the.data$m.date[the.data$cruise==cr])) if (cruise.summary[i,3] > 5) { slack <- gls(fCO2rec~1, data=the.data[the.data$cruise==cr,]) whack <- gls(fCO2rec~1, corr=corExp(value=1000,form=~m.date), data=the.data[the.data$cruise==cr,], control=glsControl(returnObject=T, opt="optim")) cruise.summary[i,2] <- summary(whack)$sigma cruise.summary[i,5] <- exp(whack$modelStruct$corStruct[1]) cruise.summary[i,6] <- anova(slack, whack)[2,9] if (diff(range(diff(the.data$m.date[the.data$cruise==cr])))==0) { # evenly spaced in time crack <- gls(fCO2rec~1, corr=corAR1(value=0.9,form=~1|cruise), data=the.data[the.data$cruise==cr,], control=glsControl(returnObject=T, opt="optim")) cruise.summary[i,7] <- diff(the.data$m.date[the.data$cruise==cr])[1] rhoish <- crack$modelStruct$corStruct[1] cruise.summary[i,8] <- exp(rhoish)/(1+exp(rhoish))*2-1 cruise.summary[i,9] <- exp(-cruise.summary[i,7]/cruise.summary[i,5]) } cruise.summary[i, 10:11] <- fit.one.cruise(the.data, cr) } if (25*trunc(i/25)==i) {print(i)} } return(cruise.summary) } gen.cruise.resids <- function(the.data) { resid.out <- numeric(nrow(the.data)) for (l in levels(the.data$cruise)) { if (length(the.data$fCO2rec[the.data$cruise==l]) > 5) { ar.model <- ar(the.data$fCO2rec[the.data$cruise==l]) resid.out[the.data$cruise==l] <- ar.model$resid # includes NA's } } return(resid.out) } # Fit a Takahashi "original" model on some subset of a dataset ## Step 1: Determine which 4 years have the densest data bfy <- function(the.data) { y.m <- sort(unique(the.data$yr.mon)) n.mos <- numeric(length(y.m)) for (i in 1:length(y.m)) { n.mos[i] <- length(y.m[y.m >= y.m[i] & y.m < y.m[i]+4]) } decent.choices <- y.m[n.mos>0.9*max(n.mos)] dist.from.med <- abs(decent.choices-median(y.m)) possible.starts <- sort(decent.choices[dist.from.med==min(dist.from.med)]) return(possible.starts[1]) # (in case there's a tie, return the earliest) } bfy.all3 <- function(the.data) { y.m <- sort(unique(the.data$yr.mon)) n.mos <- numeric(length(y.m)) for (i in 1:length(y.m)) { n.mos[i] <- length(y.m[y.m >= y.m[i] & y.m < y.m[i]+4]) } decent.choices <- y.m[n.mos>0.9*max(n.mos)] return(decent.choices) } ## Step 2: Find monthly means for the dataset. To account for possible missing ## months at the start or end of the year, we extend the "year" in each direction. fmm <- function(the.data, start.time) { the.data <- the.data[the.data$yr.mon >= start.time & the.data$yr.mon < (start.time+4),] the.data$f.mon <- factor(the.data$mon) monthly.means <- tapply(the.data$fCO2rec, the.data$f.mon, mean) months <- as.numeric(names(monthly.means)) month.df <- data.frame(months=months, means=as.numeric(monthly.means)) # There's a small3 chall3enge to interpolating if one of the missing months is # December or January, so extend the means at each end month.df <- rbind(month.df[nrow(month.df),], month.df, month.df[1,]) month.df$months[1] <- month.df$months[1] - 12 month.df$months[nrow(month.df)] <- month.df$months[nrow(month.df)] + 12 mm.out <- numeric(12) for (i in 1:12) { if (i %in% month.df$months) { # then we have that month in the data mm.out[i] <- month.df$means[month.df$months==i] } else { # we have to interpolate # find months on either side of the current month that we have data for j <- i-1; while (!(j %in% month.df$months)) {j <- j-1} k <- i+1; while (!(k %in% month.df$months)) {k <- k+1} mm.out[i] <- month.df$means[month.df$months==j] + (i-j)/(k-j)*(month.df$means[month.df$months==k] - month.df$means[month.df$months==j]) } } return(mm.out) } ## Step 3: Decycle data and reduce to monthly decycle.data <- function(the.data, start.time, monthly.means) { short.data <- the.data[the.data$yr.mon >= start.time & the.data$yr.mon < (start.time+4),] four.yr.mean <- mean(short.data$fCO2rec) the.data$adjustment <- four.yr.mean-monthly.means[the.data$mon] the.data$fco2adj <- the.data$fCO2rec+the.data$adjustment decycled.fco2 <- tapply(the.data$fco2adj, the.data$f.yr.mon, mean) return(data.frame(decycled.fco2 = tapply(the.data$fco2adj, the.data$f.yr.mon, mean), times = as.numeric(names(decycled.fco2)))) } ## Step 4: Function collecting the above and fitting the model: T0.model <- function(the.data, start.time=NA, monthly.means=NA) { if (is.na(start.time)) {start.time <- bfy(the.data)} if (is.na(monthly.means)) {monthly.means<- fmm(the.data, start.time)} monthly.df <- decycle.data(the.data, start.time, monthly.means) return(list(monthly.df = monthly.df, model = lm(decycled.fco2~times, data=monthly.df))) } # convenience function for fitting an annual spline annual.spline <- function(the.data) { return(gam(fCO2rec~s(doy,bs="cp",k=12), knots=list(doy=seq(1,365,length=13)), corr=corCompSymm(form=~1|c.date),data=the.data)) } # convenience function for fitting a W4 model W4.model <- function(the.data) { return(gamm(fCO2rec~s(doy,bs="cp",k=12) + sss*sst + I(sst^2) + I(sss^2) + a.date, knots=list(doy=seq(1,365,length=13)), weights=varComb(varExp(form=~sss/30)), corr=corExp(form=~m.date|cruise), data=the.data, control=list(returnObject=TRUE, opt="optim"))) } ############ The final code of W4 method this paper used poly.W4.model.ar1 <- function(the.data) { return(gamm(fCO2rec~s(doy,bs="cp",k=12) + sss*sst + I(sst^2) + a.date, knots=list(doy=seq(1,366,length=13)), weights=varComb(varExp(form=~sss-30),varExp(form=~sst),varExp(form=~as.numeric(a.date)/13250)), random=list(cruise=~1), corr=corAR1(value=0.995, form=~m.date|cruise), data=the.data)) } ############ The W3 method this paper used without removing the heteroscedasticity poly.W4.model.hete <- function(the.data) { return(gamm(fCO2rec~s(doy,bs="cp",k=12) + sss*sst + I(sst^2) + a.date, knots=list(doy=seq(1,366,length=13)), random=list(cruise=~1), corr=corAR1(value=0.995, form=~m.date|cruise), data=the.data)) } plot.sss.sst <- function(poly.model) { sst.seq <- seq(5, 31, 0.1) sss.seq <- seq(30, 35.5, 0.025) z <- matrix(0, length(sst.seq), length(sss.seq)) pt <- summary(poly.model$gam)$p.table for (i in 1:length(sst.seq)) {for (j in 1:length(sss.seq)) {z[i,j] <- pt[2,1]*sss.seq[j] + pt[3,1]*sst.seq[i] + pt[4,1]*sst.seq[i]^2 + pt[6,1]*sss.seq[j]*sst.seq[i]}} contour(sst.seq, sss.seq, z, xlab="SST", ylab="SSS") points(sss~sst, data=Japan, pch=16, cex=0.25, col="red") } correct.AIC <- function(gamm.model) { return(-2*logLik(gamm.model)+2*attr(logLik(gamm.model$gam), "df") + 2*attr(logLik(gamm.model$lme), "df")) } test=read.in.data("V3Japan_lat34long140.csv") ## reading the test grid T0.model.1<-T0.model(test) ##T0 method W4.model.1<-poly.W4.model.ar1(test) ##W4 method ###writing into csv file with proper output test.results.mx <- matrix(0, 1, 11) colnames(test.results.mx) <- c("min_long", "max_long","min_lati","max_lati",'p_W4','W4.trend','number.of.points','W4.error', 'BP SSS','BP SST','BP date') test.results.mx[1,1] <- min(test[,2]) #min_longitude test.results.mx[1,2] <- max(test[,2]) #max_longitude test.results.mx[1,3] <- min(test[,3]) #min_latitude test.results.mx[1,4] <- max(test[,3]) #max_latitude test.results.mx[1,5] <- summary.gam(W4.model.1$gam)$p.table[5,4] ### Pvalue for W4 method test.results.mx[1,6] <- coef(W4.model.1$gam)[5]*365.25 ### trend derived by W4 method test.results.mx[1,7] <- dim(test)[1] ### number of points in this grid test.results.mx[1,8] <- summary.gam(W4.model.1$gam)$p.table[5,2]*365.25 ### W4 standard error r<-resid(W4.model.1$lme,type="normalized") test.results.mx[1,9] <- bptest(r~test[,10])$p.value ### BP test Pvalue for SSS test.results.mx[1,10] <- bptest(r~test[,9])$p.value ### BP test Pvalue for SST test.results.mx[1,11] <- bptest(r~test[,16])$p.value ### BP test Pvalue for Sampling date write.table(test.results.mx,'TestGridOutput.csv',sep=",", row.names=F)