# Packages ---- library(ggplot2) library(data.table) library(gridExtra) setwd("G:/My Drive/Thesis - Synced/Full Thesis/Results") # Simulation ---- sim_path = function(spp, ext, D_start = 1, t_max = 100, D_max = 10^8, i_max = 10^4){ t = 0 D = D_start i = 0 record = data.frame(t = t, D = D) while (t < t_max && D > 0 && D < D_max && i <= i_max){ i = i + 1 t_spp = rexp(1, D*spp) t_ext = rexp(1, D*ext) if (t_spp < t_ext){ t = t + t_spp D = D + 1 } else{ t = t + t_ext D = D - 1 } step = data.frame(t = t, D = D) record = rbind(record, step) } record } sim_data = function(spp, ext, frame = data.frame(), t_start = 0, t_max = 100, D_max = 10^8){ t = t_start t_ext = rexp(1, ext) t_end = t_start + t_ext times = data.frame(start = t_start, end = t_end) frame = rbind(frame, times) t_spp = rexp(1, spp) t = t + t_spp while (t < min(t_max,t_end)){ frame = rbind(frame, sim_data(spp, ext, t_start=t, t_max=t_max, D_max = D_max)) t_spp = rexp(1, spp) t = t + t_spp } return(frame) } process_series = function(series){ len = dim(series)[1] dt = series$t[2:len] - series$t[1:(len-1)] dD = series$D[2:len] - series$D[1:(len-1)] type = dD == 1 series = data.frame(t = series$t[2:len], D = series$D[1:(len-1)], dt = dt, dD = dD, type = type) return(series) } div_t = function(series,time,verbose = TRUE){ len = nrow(series) if (time < series$t[1]){ if(verbose){print("Time too early")} return(NaN) } if (time > series$t[len]){ if (series$D[len] == 0){ return(0) } else{ if(verbose){print("Time out of bounds")} return(NaN) } } return(div_t_sub(series,time)) } div_t_sub = function(series,time){ len = nrow(series) i_mid_l = len %/% 2 t_check = series$t[i_mid_l:(i_mid_l+1)] if (t_check[1] <= time & t_check[2] >= time){ return(series$D[i_mid_l]) } else{ if (t_check[1] > time){ return(div_t_sub(series[1:i_mid_l,],time)) } else { return(div_t_sub(series[(i_mid_l+1):len,],time)) } } } discretize = function(series, binsize = 1, t_min = 0, t_max = NULL){ len = nrow(series) data = data.frame(t = c(), D = c()) if (is.null(t_max)){ t_max = series$t[len] } for (time in seq(t_min,t_max+binsize,binsize)){ slice = data.frame(t = time, D = div_t(series,time,FALSE)) data = rbind(data, slice) } return(data) } discrete_sims = function(n, spp, ext, binsize = 1, D_start = 1, t_max = 10, D_max = 10, i_max = 10^4){ data = array(0,c(D_max,((t_max+binsize) %/% binsize)+1)) for (i in 1:n){ series = sim_path(spp,ext,D_start,t_max,D_max,i_max) discrete = discretize(series,binsize,t_min=0,t_max=t_max) for (i_row in 1:nrow(discrete)){ row = discrete[i_row,] data[D_max-row$D,i_row] = data[D_max-row$D,i_row] + 1 } } return(data) } projection_sim = function(D_start = 1, t, spp, ext, n = 100){ D = c() for (i in 1:n){ series = sim_path(spp, ext, D_start) D = c(D, div_t(series, t)) } return(D) } multi_path_sim = function(n, spp, ext, D_start = 1, t_max = 100, D_max = 10^8, i_max = 10^4){ data = data.frame(t = c(), D = c()) for (i in 1:n){ data = rbind(data, sim_path(spp,ext,D_start, t_max, D_max, i_max)) } return(data) } multi_path_discrete = function(n, spp, ext, D_start = 1, t_max = 100, binsize = 1, D_max = 10^8, i_max = 10^4){ data = data.frame(t = c(), D = c()) for (i in 1:n){ series = rbind(data, sim_path(spp,ext,D_start, t_max, D_max, i_max)) series = discretize(series, binsize, t_max = t_max) data = rbind(data,series) } return(data) } # Likelihood ---- log_l_event = function(t, rate, n){ return(-1 * log(n * rate) + n * rate * t) } log_l_nonevent = function(t, rate, n){ l = n * rate * t if (l >= 0){ return(l) } else{ print("invalid likelihood") return(NaN) } } log_l_series = function(series, spp, ext, reverse = TRUE){ if (reverse){ series$t = -1 * series$t } rates = c(spp,ext) len = dim(series)[1] change = series[2:len,] - series[1:(len-1),] #print(change) l = 0 for (i_event in 1:(len-1)){ t_event = change[i_event,]$t D_event = series[i_event,]$D is_spp = change[i_event,]$D == 1 #print(c(t_event,D_event,is_spp)) if (is_spp){ l = l + log_l_event(t_event, spp, D_event) + log_l_nonevent(t_event, ext, D_event) } else{ l = l + log_l_event(t_event, ext, D_event) + log_l_nonevent(t_event, spp, D_event) } } return(l) } series_size = function(series,processed = FALSE){ if(!processed){ series = process_series(series) } return(sum(series$D * series$dt)) } ml_params = function(series,processed = TRUE){ if(!processed){ series = process_series(series) } n_spp = sum(series$type) n_ext = length(series[,1]) - n_spp size = series_size(series,processed) return(c(n_spp, n_ext) / size) } ml_vol = function(series, processed = TRUE){ if(!processed){ series = process_series(series) } n_events = length(series$t) size = series_size(series, processed) return(n_events / size) } # Stochastic probability functions ---- pnt = function(n0,n,t,spp,ext,alpha = NULL, beta = NULL){ if(n0 < 0){ print("Initial diversity cannot be negative") return(NaN) } if(spp < 0 | ext < 0){ print("Speciation and extinction rates cannot be negative") return(NaN) } # If initial diversity is 0, diversity will always be 0 if(n0 == 0){ if(n == 0){ return(1) } else{ return(0) } } if(n < 0){ return(0) } if(t == 0){ return(pnt_F(n0,n)) } if(spp == 0 & ext == 0){ return(pnt_F(n0,n)) } if(spp == 0 & ext > 0){ return(pnt_D(n0,n,t,ext)) } if(spp > 0 & ext ==0){ return(pnt_E(n0,n,t,spp)) } if(spp == ext){ return(pnt_A(n0,n,t,spp)) } return(pnt_B(n0,n,t,spp,ext)) } pnt_F = function(n0,n){ # Speciation = Extinction = 0 # When both rates are 0, diversity remains constant indefinitely if(n == n0){ return(1) } else{ return(0) } } pnt_D = function(n0,n,t,ext){ # Speciation = 0 # Extinction > 0 # A pure death process; each extinction can be treated as an independent random # variable with an exponential distribution if (n > n0){ return(0) } else{ p_ext = 1 - exp(-ext * t) return(choose(n0,n) * (p_ext^(n0 - n)) * ((1 - p_ext)^n)) } } pnt_E = function(n0,n,t,spp){ # Speciation > 0 # Extinction = 0 # Yule process; equation from Stochastic Processes by Sheldon Ross, 1983 if (n < n0){ return(0) } else{ return( choose(n - 1, n0 - 1) * exp(-1 * spp * t * n0) * (1 - exp(-spp * t))^(n - n0) ) } } pnt_A = function(n0,n,t,vol){ # Speciation = Extinction # Equations from Raup 1985 # Equation A12 if (n == 0){ return( ((vol * t) / (1 + vol * t))^n0 ) } # Equation A16 else{ j = 1:min(n0,n) return( (((vol * t) / (1 + vol * t))^(n0 + n)) * sum(choose(n0,j) * choose(n-1,j-1) * ((vol * t)^(-2 * j))) ) } } pnt_B = function(n0,n,t,spp,ext){ # Speciation =/= Extinction # Equations from Raup 1985 # Equation A13 alpha = (ext*(exp((spp-ext)*t)-1))/(spp*exp((spp-ext)*t)-ext) # Equation A14 if(n == 0){ return(alpha^n0) } # Equation A18 else{ beta = alpha * spp / ext j = 0:min(n0,n) return(sum( choose(n0,j) * choose(n0+n-j-1,n0-1) * (alpha^(n0-j)) * (beta^(n-j)) * ((1-alpha-beta)^j) )) } } # Projection ---- p_dist = function(n0,max,t,spp,ext){ p = c() d = 0:max for (n in d){ p = c(p,pnt(n0,n,t,spp,ext)) } return(data.frame(D = d, p = p)) } mode_n = function(n0,t,spp,ext){ expected = n0 * exp((spp - ext) * t) range = 0:(round(expected) * 2) return(mode_n_sub(n0,t,spp,ext,range)) } mode_n_sub = function(n0,t,spp,ext,range){ bot = min(range) top = max(range) # print(c(bot,top)) if(length(range) < 3){ p_low = pnt(n0,bot,t,spp,ext) p_high = pnt(n0,top,t,spp,ext) if(p_low > p_high){ return(bot) } if(p_low < p_high){ return(top) } else{ return((bot+top)/2) } } mid = (bot + top) %/% 2 low = (bot + mid) %/% 2 high = (mid + top) %/% 2 p_low = pnt(n0,low,t,spp,ext) p_high = pnt(n0,high,t,spp,ext) if(p_low > p_high){ return(mode_n_sub(n0,t,spp,ext,bot:mid)) } else{ return(mode_n_sub(n0,t,spp,ext,mid:top)) } } conf_pnt = function(n0,t,spp,ext,int = 0.95){ if(t == 0){ return(data.frame(lower = n0, upper = n0, p = 1)) } center = mode_n(n0,t,spp,ext) p = pnt(n0,center,t,spp,ext) lower = center - 1 upper = center + 1 while(p < int){ p_upper = pnt(n0,upper,t,spp,ext) p_lower = pnt(n0,lower,t,spp,ext) if(p_upper > p_lower){ p = p + p_upper upper = upper + 1 } else{ p = p + p_lower lower = lower - 1 } } return(data.frame(lower = lower + 1, upper = upper - 1, p = p)) } conf_series = function(n0,t_series,spp,ext,int = 0.95){ data = data.frame() for (t in t_series){ step = cbind(t = t, conf_pnt(n0,t,spp,ext,int)) data = rbind(data, step) } return(data) } density_plot = function(n0,t_series,spp,ext,n_max,interpolate=FALSE, contours = TRUE, confints = c(), breaks = c(0.05,0.1,0.25,0.5,0.9,0.95)){ data = data.frame() for(time in t_series){ for(n in 0:n_max){ data = rbind(data, data.frame(t = time, n = n, p = pnt(n0,n,time,spp,ext))) } } plt = ggplot(data,aes(x = t, y = n)) plt = plt + geom_raster(aes(fill = p),interpolate=interpolate) if(contours){ plt = plt + geom_contour(aes(z = p),colour = "white",breaks = breaks, show.legend = TRUE) } for(interval in confints){ conf_data = conf_series(n0,t_series,spp,ext,interval) plt = plt + geom_line(data=conf_data,aes(x = t, y = lower),colour = "white") + geom_line(data=conf_data,aes(x = t, y = upper),colour = "white") } plt = plt + theme_minimal() plt } size_to_dev = function(size){ return(exp(-0.001984763*size)*0.508401) } conf_int_from_series = function(series, dt, confidence = 0.95){ vol = ml_vol(series) dev = size_to_dev(series_size(series, TRUE)) vol_high = qnorm(sqrt(confidence), vol, dev*vol) D_series = series$D n0 = D_series[length(D_series)] conf_int = conf_pnt(n0, dt, vol_high/2, vol_high/2, sqrt(confidence)) conf_int$p = sqrt(confidence)*conf_int$p conf_int$vol = vol conf_int$dt = dt return(conf_int) } conf_series_from_D_series = function(D_series, dt_series, confidence = 0.95){ vol = ml_vol(D_series) dev = size_to_dev(series_size(D_series, TRUE)) vol_high = qnorm(sqrt(confidence), vol, dev*vol) t0 = D_series$t[nrow(D_series)] D_series = D_series$D n0 = D_series[length(D_series)] conf_series = data.frame() for(dt in dt_series){ int = conf_pnt(n0, dt, vol_high/2, vol_high/2, sqrt(confidence)) conf_series = rbind(conf_series, int) } conf_series$p = sqrt(confidence) * conf_series$p conf_series$dt = dt_series conf_series$t = t0 - conf_series$dt return(conf_series) } t = seq(0.1,5,0.1) # Data Management ---- load_data = function(path="G:/My Drive/Thesis - Synced/pbdb_data_full.csv"){ # Loads fossil data and sorts it by genus and species. Returns it as a data frame. # path: The filepath for a csv from the paleobiology database. pbdb_data = read.csv(path, skip = 21, stringsAsFactors = FALSE) return(pbdb_data) } select_fams = function(data, families = c("Mytilidae","Veneridae","Lucinidae","Pectinidae","Pholadomyidae")){ return(data[data$family %in% families,]) } organize_data = function(data){ # Organizes a data csv into a list of genera. Each # row in a genus is a species. The columns record the species' age range and # geographic range. Returns the list of genera. # data: A csv of fossil data generated by load_data. # Count the number of entries n_occurences = length(data$occurence_no) # Assign an age to each fossil randomly distribued between its minimum # and maximum ages data$age = runif(length(data$min_ma),min=data$min_ma,max=data$max_ma) # Iterate through genera family_list = split(data,data$family,drop=TRUE) family_data_list = list() name_list = c() for (family in family_list){ species_list = split(family,family$accepted_name,drop=TRUE) family_name = as.character(species_list[[1]]$family[1]) name_list = c(name_list,family_name) family_data = c() # Iterate through species species_names = c() for (species in species_list){ # Record relevant data species_name = as.character(species$accepted_name[1]) oldest_occ = max(species$age) youngest_occ = min(species$age) n_occurences = nrow(species) species_data = data.frame(oldest_occ,youngest_occ, n_occurences) family_data = rbind(family_data,species_data) species_names = c(species_names,species_name) } rownames(family_data) = species_names family_data_list = c(family_data_list, list(family_data)) } names(family_data_list) = name_list family_data_list } randomize_ages = function(pbdb_data){ pbdb_data$age = runif(length(pbdb_data$min_ma), pbdb_data$min_ma, pbdb_data$max_ma) return(pbdb_data) } compile_spp = function(family_data){ family_data = randomize_ages(family_data) spp_list = split(family_data, family_data$accepted_name) spp_list = lapply(spp_list, compile_spp_sub) spp_data = rbindlist(spp_list) return(spp_data) } compile_spp_sub = function(species){ name = species$accepted_name[1] oldest_occ = max(species$age) youngest_occ = min(species$age) n_occurences = nrow(species) return(data.frame(oldest_occ,youngest_occ, n_occurences, name)) } countrefs = function(data){ allrefs = data.frame(reference_no = data$reference_no, ref_pubyr = data$ref_pubyr) family_list = split(allrefs, data$family, drop=TRUE) family_list = lapply(family_list, FUN = countrefs_sub) return(family_list) } countrefs_sub = function(family){ refs = unique(family) refs = na.omit(refs) first = min(refs$ref_pubyr) last = max(refs$ref_pubyr) cumrefs = data.frame(year = first:last, count = rep.int(0, last-first + 1)) for(yr in refs$ref_pubyr){ cumrefs$count = cumrefs$count + c(rep.int(0,yr-first),rep.int(1,last-yr+1)) } return(cumrefs) } age_cutoff = function(data, age){ return(data[data$min_ma < age]) } extend_ranges = function(clade){ preservation_rate = clade$n_occurences/(clade$oldest_occ - clade$youngest_occ) infs = is.infinite(preservation_rate) preservation_rate[infs] = median(preservation_rate[!infs]) clade$oldest_occ = clade$oldest_occ + rexp(preservation_rate) clade$youngest_occ = clade$youngest_occ - rexp(preservation_rate) clade$youngest_occ = pmax(clade$youngest_occ, 0) return(clade) } series_from_clade = function(clade, range = c(Inf, 0.1)){ speciations = clade$oldest_occ speciations = speciations[speciations > range[2]] #speciations = speciations[-which.max(speciations)] n_spp = length(speciations) extinctions = clade$youngest_occ extinctions = extinctions[extinctions > range[2]] n_ext = length(extinctions) series = data.frame(t = speciations, dD = rep.int(1,n_spp), type = rep.int(TRUE,n_spp)) series = rbind(series, data.frame( t = extinctions, dD = rep.int(-1,n_ext), type = rep.int(FALSE,n_ext) )) series = series[order(series$t,decreasing = TRUE),] n_events = length(series$t) D = rep.int(0,n_events) for (i_event in 1:(n_events)){ change = series[i_event,]$dD D[(i_event):n_events] = D[(i_event):n_events] + change } series$D = D series$dt = c(max(clade$oldest_occ),series$t[1:(n_events-1)]) - series$t series = series[series$t < range[1],] series } time_section = function(clade, min, max = 700){ return(clade[clade$t > min & clade$t < max,]) } estimate_params = function(family_list, min_D){ param_data = data.frame() allnames = names(family_list) namelist = c() for (i in 1:length(family_list)){ family = family_list[[i]] if (dim(family)[1] >= min_D){ family = series_from_clade(family) vol = ml_vol(family) maxD = max(family$D) name = allnames[i] param_data = rbind(param_data, data.frame(vol = vol, maxD = maxD, name = name)) namelist = c(namelist,name) } } rownames(param_data) = namelist param_data } aic_test = function(series){ params = ml_params(series, TRUE) spp = params[1] ext = params[2] vol = spp + ext lnl1 = log_l_series(series,vol/2,vol/2) lnl2 = log_l_series(series,spp,ext) AIC_1 = 2 - 2 * lnl1 AIC_2 = 4 - 2 * lnl2 min_aic = min(AIC_1,AIC_2) vol_model = data.frame(param_1 = vol, param_2 = NaN, likelihood = lnl1, AIC = AIC_1, preference = exp((min_aic - AIC_1)/2)) spp_ext_model = data.frame(param_1 = spp, param_2 = ext, likelihood = lnl2, AIC = AIC_2, preference = exp((min_aic - AIC_2)/2)) return(rbind(vol_model,spp_ext_model)) } estimation_test = function(n_trials){ frame = data.frame() for (trial in 1:n_trials){ size = 0 len = 0 while (size < 2 | len < 3){ vol = runif(1,0.0001,3) series = sim_path(vol/2,vol/2,i_max = 100, D_max = 500, t_max = 500) size = series_size(series, processed = FALSE) len = dim(series)[1] } estimate = ml_vol(series,processed = FALSE) frame = rbind(frame, data.frame(sim_vol = vol, est_vol = estimate, size = size, err = (estimate-vol)/vol)) } return(frame) } # Data validation ---- spp_data = split(pbdb_data, pbdb_data$accepted_no) spp_ids = names(spp_data) singleton_ids = c() one_bin_ids = c() one_pub_ids = c() for (id in spp_ids){ spp = spp_data[[id]] if(nrow(spp) == 1){ singleton_ids = c(singleton_ids, id) } else{ n_bins = length(unique(spp$early_interval)) if(n_bins == 1){ one_bin_ids = c(one_bin_ids, id) } else{ n_occ = length(spp$occurence_no) if(n_occ < 5){ n_pubs = length(unique(spp$reference_no)) if(n_pubs == 1){ one_pub_ids = c(one_pub_ids, id) } } } } } one_bin_spp = spp_data[one_bin_ids] one_pub_spp = spp_data[one_pub_ids] data_no_singletons = pbdb_data[!(pbdb_data$accepted_no %in% as.integer(singleton_ids)),] family_list_ns = split(data_no_singletons, data_no_singletons$family, drop = TRUE) # Clades to analyze: Mytilidae, Lucinidae, Veneridae, Pectinidae, Pholadomyidae mytilidae_ns = family_list_ns$Mytilidae lucinidae_ns = family_list_ns$Lucinidae veneridae_ns = family_list_ns$Veneridae pectinidae_ns = family_list_ns$Pectinidae pholadomyidae_ns = family_list_ns$Pholadomyidae # Error Estimation ---- error_sim = function(vol_range, trials, min_size = 100){ sim_results = data.frame() for(vol_sim in vol_range){ for(trial in 1:trials){ sim_data = process_series(sim_path(vol_sim/2, vol_sim/2, i_max = 1000)) size = series_size(sim_data, TRUE) if(size > min_size){ vol_est = ml_vol(sim_data, TRUE) sim_results = rbind(sim_results, data.frame(vol_sim = vol_sim, vol_est = vol_est, size = size)) } } } sim_results$err = sim_results$vol_est - sim_results$vol_sim sim_results$err_frac = sim_results$err / sim_results$vol_sim return(sim_results) } sd_sections = function(sim_results, breaks = 0:10*10){ n = length(breaks) results = data.frame() for(section in 1:(n-1)){ low = breaks[section] high = breaks[section+1] data = sim_results[sim_results$size > low & sim_results$size < high,] n = nrow(data) results = rbind(results, data.frame(min = low, max = high, sd = sd(data$err_frac), n = n)) } return(results) } sd_window = function(sim_results, window_size = 100, step = 1){ min_size = min(sim_results$size) max_size = max(sim_results$size) bottom_range = round(min_size):(round(max_size) - window_size) results = data.frame() for(bottom in bottom_range){ data = sim_results[sim_results$size > bottom & sim_results$size < bottom+window_size,] n = nrow(data) results = rbind(results, data.frame(min = bottom, max = bottom+window_size, sd = sd(data$err_frac), n = n)) } return(results) } # test = err[err$size > 20 & err$size < 30,] # hist(test$err_frac, breaks = ((-15:20)/10)) # Standard deviation of estimates seems to stabilize at 15% of true value for size > 100 ## Analysis ---- pbdb_data = load_data() pbdb_data = select_fams(pbdb_data) family_list = split(pbdb_data, pbdb_data$family, drop = TRUE) family_list_compiled = lapply(family_list, compile_spp) family_list_series = lapply(family_list_compiled, series_from_clade) family_list_ns_compiled = lapply(family_list_ns, compile_spp) family_list_ns_series = lapply(family_list_ns_compiled, series_from_clade) # Clades to analyze: Mytilidae, Lucinidae, Veneridae, Pectinidae, Pholadomyidae mytilidae = family_list_series$Mytilidae lucinidae = family_list_series$Lucinidae veneridae = family_list_series$Veneridae pectinidae = family_list_series$Pectinidae pholadomyidae = family_list_series$Pholadomyidae mytilidae_ns = family_list_ns_series$Mytilidae lucinidae_ns = family_list_ns_series$Lucinidae veneridae_ns = family_list_ns_series$Veneridae pectinidae_ns = family_list_ns_series$Pectinidae pholadomyidae_ns = family_list_ns_series$Pholadomyidae pbdb_strat = read.csv("http://paleobiodb.org/data1.1/intervals/list.txt?scale=all&limit=all") pbdb_strat = pbdb_strat[pbdb_strat$early_age < 252,] periods = pbdb_strat[pbdb_strat$level == 3,]$early_age stages = pbdb_strat[pbdb_strat$level == 5,]$early_age all_fam_series = series_from_clade(compile_spp(data_no_singletons)) ml_vol_dist = function(pbdb_data, trials){ vol_estimates = data.frame(vol_est = rep(NaN, trials), size = rep(NaN, trials)) for(i in 1:trials){ spp_data = compile_spp(pbdb_data) series = series_from_clade(spp_data) vol_estimates[i,] = c(ml_vol(series, TRUE), series_size(series, TRUE)) } vol_estimates$dev = apply(vol_estimates["size"], 1, size_to_dev)*vol_estimates$vol_est return(vol_estimates) } vol_freq = function(vol_dist, breaks){ breakdown = hist(vol_dist$vol_est, breaks = breaks, plot = FALSE) freq = breakdown$counts/nrow(vol_dist) data = data.frame(mid = breakdown$mids, freq = freq) return(data) } vol_range_p = function(range, vol_dist){ n = nrow(vol_dist) return( sum( pnorm(range[2], vol_dist$vol_est, vol_dist$dev) - pnorm(range[1], vol_dist$vol_est, vol_dist$dev) ) /n) } vol_dist_p = function(vol_dist, breaks){ n_bins = length(breaks) - 1 lower = breaks[1:n_bins] upper = breaks[2:(n_bins+1)] result = data.frame(lower = lower, upper = upper, p = rep(NaN, n_bins)) for(i_bin in 1:n_bins){ result$p[i_bin] = vol_range_p(c(lower[i_bin], upper[i_bin]), vol_dist) } return(result) } plot_p_dist = function(vol_dist, min_vol, max_vol, binwidth){ breaks = seq(min_vol, max_vol, binwidth) data = vol_dist_p(vol_dist, breaks) data$mid = (data$lower + data$upper)/2 plt = ggplot(data = data, aes(x = mid, y = p)) plt = plt + geom_col(width = binwidth) return(plt) } normal_bins = function(mean, sd, breaks){ n_bins = length(breaks) - 1 return(pnorm(breaks[2:(n_bins+1)],mean,sd) - pnorm(breaks[1:n_bins],mean,sd)) } ggplot(data = mytilidae_dist) + geom_histogram(aes(x = vol_est), breaks = breaks) #mytilidae_dist = ml_vol_dist(mytilidae, 1000) mytilidae_dist = Mytilidae_ns_vol_dist breaks = seq(0.10, 0.40, 0.001) mbreaks = seq(0.1, 0.15, 0.001) mytilidae_dist_p = vol_dist_p(mytilidae_dist, mbreaks) mytilidae_dist_p$mid = (mytilidae_dist_p$lower + mytilidae_dist_p$upper) / 2 mytilidae_dist_p$age_freq = vol_freq(mytilidae_dist, mbreaks)$freq mytilidae_dist_p$est_p = normal_bins(mean(mytilidae_dist$vol_est), size_to_dev(mean(mytilidae_dist$size)), mbreaks) # Error sources comparison plt = ggplot(mytilidae_dist_p, aes(x = mid))+ geom_col(aes(y = age_freq), fill = "blue", color = "blue") + geom_line(aes(y = p), color = "white", size = 2)+ geom_line(aes(y = est_p), color = "white", size = 2) + geom_line(aes(y = p), color = "purple3", size = 1)+ geom_line(aes(y = est_p), color = "red", size = 1) + xlab("Volatility") + ylab("Probability") + theme_minimal() plt lbreaks = seq(0.18,0.26,0.001) Lucinidae_dist_p = vol_dist_p(Lucinidae_ns_vol_dist, lbreaks) Lucinidae_dist_p$mid = (Lucinidae_dist_p$lower + Lucinidae_dist_p$upper) / 2 Lucinidae_dist_p$age_freq = vol_freq(Lucinidae_ns_vol_dist, lbreaks)$freq Lucinidae_dist_p$est_p = normal_bins(mean(Lucinidae_ns_vol_dist$vol_est), size_to_dev(mean(Lucinidae_ns_vol_dist$size)), lbreaks) # Error sources comparison plt = ggplot(Lucinidae_dist_p, aes(x = mid))+ geom_col(aes(y = age_freq), fill = "blue", color = "blue") + geom_line(aes(y = p), color = "white", size = 2)+ geom_line(aes(y = est_p), color = "white", size = 2) + geom_line(aes(y = p), color = "purple3", size = 1)+ geom_line(aes(y = est_p), color = "red", size = 1) + xlab("Volatility") + ylab("Probability") + theme_minimal() plt # This analysis takes about an hour vol_results = lapply(family_list, ml_vol_dist, 1000) vol_results_ns = lapply(family_list_ns, ml_vol_dist, 1000) for(family_name in names(vol_results)){ result = vol_results[[family_name]] write.csv(result, file = paste(family_name, "_vol_dist", ".csv",sep = "")) } for(family_name in names(vol_results_ns)){ result = vol_results_ns[[family_name]] write.csv(result, file = paste(family_name, "_ns_vol_dist", ".csv",sep = "")) } vol_dist_plot = function(family_name, full_data, ns_data, breaks = breaks, pmax = 0.27){ plt = ggplot() dist_full = vol_dist_p(full_data[[family_name]], breaks) dist_ns = vol_dist_p(ns_data[[family_name]], breaks) dist_full$mid = (dist_full$low + dist_full$high)/2 dist_ns$mid = (dist_ns$low + dist_ns$high)/2 plt = plt + geom_line(data = dist_full, aes(x = mid, y = p), color = "red") plt = plt + geom_line(data = dist_ns, aes(x = mid, y = p), color = "blue") mean_full = mean(full_data[[family_name]]$vol_est) mean_ns = mean(ns_data[[family_name]]$vol_est) plt = plt + geom_segment(data = dist_full, aes(x = mean_full, xend = mean_full, y = 0, yend = pmax), color = "red", linetype = "dashed") plt = plt + geom_segment(data = dist_ns, aes(x = mean_ns, xend = mean_ns, y = 0, yend = pmax), color = "blue", linetype = "dashed") plt = plt + ylim(c(0,pmax)) plt = plt + xlab(NULL) + ylab(NULL) plt = plt + theme_minimal() plt = plt + ggtitle(family_name) + theme(plot.title = element_text(hjust = 0.5)) return(plt) } p1 = vol_dist_plot("Mytilidae", vol_results, vol_results_ns, breaks) p2 = vol_dist_plot("Lucinidae", vol_results, vol_results_ns, breaks) p3 = vol_dist_plot("Veneridae", vol_results, vol_results_ns, breaks) p4 = vol_dist_plot("Pectinidae", vol_results, vol_results_ns, breaks) p5 = vol_dist_plot("Pholadomyidae", vol_results, vol_results_ns, breaks) grid.arrange(p1,p2,p3,p4,p5,nrow = 5) aic_results = data.frame() for(family_name in names(family_list_ns_series)){ family = family_list_ns_series[[family_name]] if(family_name == "Pholadomyidae"){ family = family[family$t > 70 & family$t < 170,] } else{ family = family[family$t < 100,] } test = aic_test(family) test = test[1,] test$family = family_name aic_results = rbind(aic_results, test) } ## Error Analysis ---- err = error_sim(1:20/100, 500) err_reduced = err[err$size < 1000,] #err_reduced = err_reduced[err_reduced$size > 10,] err_reduced = err_reduced[err_reduced$err_frac < 10,] plot(err_reduced$size, err_reduced$err_frac) err_sd = sd_sections(err, 1:10*100) err_sd$ln_sd = log(err_sd$sd) err_sd$sd_predicted = size_to_dev(err_sd$max) plot(err_sd$max, err_sd$sd) plot(err_sd$min, err_sd$ln_sd) err_win = sd_window(err) plot(err_win$min, err_win$sd) err_win_10 = sd_window(err, window_size = 10) plot(err_win_10$min, err_win_10$sd) err_win_10$sd_ln = log(err_win_10$sd) err_win_10$sd_predicted = size_to_dev(err_win_10$min) plot(err_win_10$min, err_win_10$sd_ln) plt = ggplot(data = err_win_10) plt = plt + theme_minimal() plt = plt + geom_point(aes(x = min, y = sd), alpha = 1/5) plt = plt + geom_line(aes(x = min, y = sd_predicted)) plt plt = ggplot(data = err) plt = plt + theme_minimal() plt = plt + geom_point(aes(x = size, y = err_frac), alpha = 1/5) plt err_win_10$mid = (err_win_10$min+err_win_10$max)/2 ln_sd_line = lm(err_win_10$sd_ln ~ err_win_10$mid) c1 = ln_sd_line$coefficients[2] c2 = exp(ln_sd_line$coefficients[1]) ## PLOTTING ---- # Plot diversity series d_series_plot = function(series){ plt = ggplot(series) plt = plt + theme_minimal() plt = plt + geom_line(aes(x = t, y = D)) plt = plt + scale_x_reverse() return(plt) } projection_plot = function(series, t, xmin = 0, confidence = 0.95){ plt = d_series_plot(series) projection = conf_series_from_D_series(time_section(series,t), t:xmin, confidence = confidence) plt = plt + geom_line(data = projection, color = "blue", linetype = "dotted", size = 1.5, lineend = "round", aes(x = projection$t, y = projection$lower)) plt = plt + geom_line(data = projection, color = "blue", linetype = "dotted", size = 1.5, lineend = "round", aes(x = projection$t, y = projection$upper)) return(plt) } # Back-projection p1 = projection_plot(pholadomyidae_ns, 70)+ggtitle("Pholadomyidae", )+ theme(plot.title=element_text(hjust=0.5)) p2 = projection_plot(lucinidae_ns, 70)+ggtitle("Lucinidae", )+ theme(plot.title=element_text(hjust=0.5)) p3 = projection_plot(mytilidae_ns, 70)+ggtitle("Mytilidae", )+ theme(plot.title=element_text(hjust=0.5)) p4 = projection_plot(pectinidae_ns, 70)+ggtitle("Pectinidae", )+ theme(plot.title=element_text(hjust=0.5)) p5 = projection_plot(veneridae_ns, 70)+ggtitle("Veneridae", )+ theme(plot.title=element_text(hjust=0.5)) grid.arrange(p1,p2,p3,p4,p5) p2 = projection_plot(lucinidae_ns, 0,50)+ggtitle("Lucinidae", )+ theme(plot.title=element_text(hjust=0.5)) p3 = projection_plot(mytilidae_ns, 0,50)+ggtitle("Mytilidae", )+ theme(plot.title=element_text(hjust=0.5)) p4 = projection_plot(pectinidae_ns, 0,50)+ggtitle("Pectinidae", )+ theme(plot.title=element_text(hjust=0.5)) p5 = projection_plot(veneridae_ns, 0,50)+ggtitle("Veneridae", )+ theme(plot.title=element_text(hjust=0.5)) grid.arrange(p2,p3,p4,p5) # Plot fossil spp and ext estimates param_data = estimate_params(family_list, 100) plt = ggplot(param_data, aes(x = vol, y = maxD)) plt = plt + geom_text(aes(label = name), vjust = -0.5, hjust = -0.1) plt = plt + theme_minimal() plt = plt + geom_point(colour = "dark green") plt # Plot estimation accuracy for simulated volatility sim_data = estimation_test(500) plt = ggplot(sim_data,aes(x = sim_vol, y = est_vol, colour = size)) ms = max(sim_data$size) plt = plt + scale_color_gradientn(colours = c("red","orange","yellow"), values = c(0.0,0.01,0.2,1), breaks = c(0,20,300,ms)) plt = plt + theme_minimal() plt = plt + geom_point() plt = plt + coord_fixed() plt plt = ggplot(sim_data,aes(x=size,y=err)) plt = plt + theme_minimal() plt = plt + geom_point(colour = "blue") plt err = 0 while(err < 5){ len = 0 while (len < 5){ vol = runif(1,0.001,3) sim = sim_path(vol/2,vol/2,i_max = 100) len = dim(sim)[1] } estimate = ml_vol(sim,FALSE) err = estimate / vol } print(sim) print(vol) print(estimate) # Plot diversity of all bivalves all_bivalves = load_data("G:/My Drive/Thesis - Synced/all_bivalves.csv") bivalve_spp = compile_spp(all_bivalves) bivalve_series = series_from_clade(bivalve_spp) bivalve_plot = d_series_plot(bivalve_series) + xlab("Age (Ma)") + ylab("Diversity") bivalves_mesozoic = time_section(bivalve_series, 65, 252) bivalves_cenozoic = time_section(bivalve_series, 0,65) #Volatility example plot l1 = sim_path(1,1,10) l2 = sim_path(1,1,10) l3 = sim_path(1,1,10) h1 = sim_path(2,2,10) h2 = sim_path(2,2,10) h3 = sim_path(2,2,10) plt = ggplot() for(series in list(l1,l2,l3)){ plt = plt + geom_line(data = series, aes(x = t, y = D), colour = "blue") } for(series in list(h1,h2,h3)){ plt = plt + geom_line(data = series, aes(x = t, y = D), colour = "red") } plt = plt + theme_minimal() + xlab("Time") + ylab("Diversity") plt d_series_plot(pectinidae_ns) d_series_plot(series_from_clade(compile_spp(pectinidae_ns))) + xlim(275,0) + ylim(0,125) + xlab("Time") + ylab("Diversity")