rm(list=ls()) source("/Users/teeniematlock/Desktop/statistics/scripts/bodos_functions.R") source("/Users/teeniematlock/Desktop/statistics/scripts/classic.R") options(show.signif.stars=FALSE) library(glmmADMB);library(lme4) setwd("/Users/teeniematlock/Desktop/research/cultural_evolution/L2_typology_brandnew/") kasus = read.csv("kasus.csv",header=TRUE) # contains all 66 languages ############################ ANALYSIS ############################ Looking at the distributions setwd("high_res_figures_for_paper") png(file="cb_bw_L2case_fig1.png",res=300,units="in",type="quartz",width=10,height=4) quartz(width=10,height=4) par(mai=c(1,1,0.2,1),mfrow=c(1,2)) mybreaks = c(seq(0,1,0.2)+0.05,seq(0,1,0.2)-0.05) mybreaks = sort(mybreaks) par(mai=c(1,1,0.2,1)) hist(kasus$Percent.L2,xlim=c(-0.05,1.05),breaks=mybreaks,col="lightgray",freq=FALSE,main="",xlab="",ylab="",yaxt="n") axis(side=2,at=seq(0,12,2)/6.6,labels=seq(0,12,2)) axis(side=4,at=seq(0,2,0.5),labels=seq(0,2,0.5)) abline(h=0) lines(density(kasus$Percent.L2)$x,density(kasus$Percent.L2)$y,lwd=2) mtext(side=1,"L2 speaker proportion",cex=1.5,line=3) mtext(side=4,"Density estimates",cex=1.25,line=2.5) mtext(side=2,"Frequency counts",cex=1.25,line=2.5) text(x=0.9,y=1.9,"(a)",cex=2.25) rm(mybreaks) hist(kasus$Case.Rank,xlim=c(-0.5,7.5),col="lightgray",freq=FALSE,main="",xlab="",ylab="",xaxt="n",yaxt="n",breaks=seq(-0.5,7.5,1)) axis(side=1,at=0:7,labels=0:7) axis(side=4,at=seq(0,0.4,0.1),labels=seq(0,0.4,0.1)) axis(side=2,at=seq(0,30,5)/66,labels=seq(0,30,5)) abline(h=0) lines(density(kasus$Case.Rank)$x,density(kasus$Case.Rank)$y,lwd=2) mtext(side=1,"Case Rank",cex=1.5,line=3) text(x=6.5,y=0.45,"(b)",cex=2.25) pdfc(file="cb_bw_L2case_fig1.pdf",width=10,height=4) dev.off() # looks somewhat bimodal univariate(kasus$Case.Rank) univariate(kasus$Percent.L2) xtabs(~kasus$Case.Rank) # 31 languages have no case 31/nrow(kasus) # proportion of languages without case aggregate(Percent.L2 ~ Case.Presence, data=kasus, FUN=mean) aggregate(Percent.L2 ~ Case.Presence, data=kasus, FUN=sd) ############################ ANALYSIS STEP 1: ############################ CATEGORICAL DATA ANALYSIS (GENERAL) ############################ AND CONTINUOUS DATA ANALYSIS (GENERAL) xmdl = lmer(Case.Presence ~ Percent.L2 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus, family="binomial"(link="logit")) xmdl.red = lmer(Case.Presence ~ 1 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus, family="binomial") anova(xmdl.red,xmdl,test="Chisq") summary(xmdl) # PLOTS WITH MODELS png(file="cb_bw_L2case_fig2.png",res=300,units="in",type="quartz",width=10,height=4) quartz(width=10,height=4) par(mai=c(1,1,0.5,0.5),mfrow=c(1,2)) plot(kasus$Percent.L2,jitter(kasus$Case.Presence,factor=0.25),xlab="",ylab="",yaxt="n") axis(side=2,at=c(0,1),labels=c("No Case","Case"),cex.axis=1.5) text("(a)",x=0.9,y=0.9,cex=2.25) mtext(side=1,line=3.25,cex=1.5,"Proportion of L2 speakers") xintercept = 1.457604 # taken from the model above xslope = -6.572822 xvals = seq(0,1,0.001) LP = xintercept + xslope*xvals xfitted = logit.inv(LP) lines(xvals,xfitted,lwd=2) plot(Case.Rank ~ Percent.L2, data=kasus, pch=21,xlab="",ylab="") mtext(side=1,"Proportion of L2 speakers", cex=1.5, line=3.25) mtext(side=2,"Case Rank", cex=1.5, line=3) text("(b)",x=0.9,y=6,cex=2.25) xintercept=1.740566 # taken from the negative binomial model below xslope=-3.637383 xvals=seq(0,1,0.001) LP = xintercept + xslope*xvals xfitted = exp(LP) lines(xvals,xfitted,lwd=2) # xintercept=1.7219871 # this is the model without Case Rank == 0 # xslope=-0.8485025 # LP = xintercept + xslope*xvals # xfitted = exp(LP) # lines(xvals,xfitted,lwd=2,lty="dotdash") rm(xintercept,xslope,xvals,LP,xfitted) quartz.save("cb_bw_L2case_fig2.png",type="png",dpi=300) pdfc(file="cb_bw_L2case_fig2.pdf",width=10,height=4) dev.off() case.pr.stock = data.frame( aggregate(kasus$Percent.L2~kasus$Stock,FUN=mean), aggregate(as.numeric(kasus$Case.Presence)~kasus$Stock,FUN=mean)[,2]) names(case.pr.stock) = c("Stock","Percent.L2","Case.Presence") case.pr.region = data.frame( aggregate(kasus$Percent.L2~kasus$Region,FUN=mean), aggregate(as.numeric(kasus$Case.Presence)~kasus$Region,FUN=mean)[,2]) names(case.pr.region) = c("Region","Percent.L2","Case.Presence") case.rank.stock = data.frame( aggregate(kasus$Percent.L2~kasus$Stock,FUN=mean), aggregate(kasus$Case.Rank~kasus$Stock,FUN=mean)[,2]) names(case.rank.stock) = c("Stock","Percent.L2","Case.Rank") case.rank.region = data.frame( aggregate(kasus$Percent.L2~kasus$Region,FUN=mean), aggregate(kasus$Case.Rank~kasus$Region,FUN=mean)[,2]) names(case.rank.region) = c("Region","Percent.L2","Case.Rank") ## Linear model analysis because of sparsity concerns regarding not ## enough data points per random effect level summary(lm(Case.Rank ~ Percent.L2, data=case.rank.stock)) summary(lm(Case.Rank ~ Percent.L2, data=case.rank.region)) summary(lm(Case.Presence ~ Percent.L2, data=case.pr.stock)) summary(lm(Case.Presence ~ Percent.L2, data=case.pr.region)) summary(xmdl.red) summary(xmdl) summary(xmdl.red) xmdl = lm(Case.Presence ~ Percent.L2, data=case.pr.region) summary(xmdl) #png(file="cb_bw_L2case_fig3.png",res=300,units="in",type="quartz",width=10,height=4) pdf(file="cb_bw_L2case_fig3.pdf",width=8,height=4) quartz(width=8,height=6) par(mfrow=c(2,2),mai=c(0.5,0,0,0.25),oma=c(2,8,3,0)) # plot 1 plot(case.pr.stock$Percent.L2,case.pr.stock$Case.Presence,xlab="",ylab="",main="",xlim=c(-0.03,1.03),ylim=c(-0.03,1.03)) lines(lowess(case.pr.stock$Percent.L2,case.pr.stock$Case.Presence)$x,lowess(case.pr.stock$Percent.L2,case.pr.stock$Case.Presence)$y,lwd=2) #abline(lm(Case.Presence ~ Percent.L2, data=case.pr.stock),lwd=2,lty=2) mtext(side=2,cex=1.5,"Proportion of \nCase Presence",line=3.25) mtext(side=3,cex=1.5,"Family Averages",line=1.25) # plot 2 plot(case.pr.region$Percent.L2,case.pr.region$Case.Presence,xlab="",ylab="",main="",yaxt="n",xlim=c(-0.03,1.03),ylim=c(-0.03,1.03)) lines(lowess(case.pr.region$Percent.L2,case.pr.region$Case.Presence)$x,lowess(case.pr.region$Percent.L2,case.pr.region$Case.Presence)$y,lwd=2) #abline(lm(Case.Presence ~ Percent.L2, data=case.pr.region),lwd=2,lty=2) mtext(side=3,cex=1.5,"Area Averages",line=1.25) # plot 3 plot(case.rank.stock$Percent.L2,case.rank.stock$Case.Rank,xlab="",ylab="",main="",xlim=c(-0.03,1.03),ylim=c(-0.21,7.21)) lines(lowess(case.rank.stock$Percent.L2,case.rank.stock$Case.Rank)$x,lowess(case.rank.stock$Percent.L2,case.rank.stock$Case.Rank)$y,lwd=2) #abline(lm(Case.Rank ~ Percent.L2, data=case.rank.stock),lwd=2,lty=2) mtext(side=2,cex=1.5,"Case Rank",line=3) mtext(side=1,"Proportion of L2 speakers",line=3) # plot 4 plot(case.rank.region$Percent.L2,case.rank.region$Case.Rank,xlab="",ylab="",main="",yaxt="n",xlim=c(-0.03,1.03),ylim=c(-0.21,7.21)) lines(lowess(case.rank.region$Percent.L2,case.rank.region$Case.Rank)$x,lowess(case.rank.region$Percent.L2,case.rank.region$Case.Rank)$y,lwd=2) #abline(lm(Case.Rank ~ Percent.L2, data=case.rank.region),lwd=2,lty=2) mtext(side=1,"Proportion of L2 speakers",line=3) quartz.save("cb_bw_L2case_fig3.pdf",type="pdf") quartz.save("cb_bw_L2case_fig3.png",type="png",dpi=300) #pdfc(file="cb_bw_L2case_fig3.pdf",width=10,height=4) dev.off() # ANALYSIS OF RANK DATA # is a zero-inflation term needed? xtabs(~kasus$Case.Rank)[1]/nrow(kasus) # actual frequency of zeros ppois(0,mean(kasus$Case.Rank)) # expected frequency of zeros # this means that I need zero-inflated poisson regression kasus.mdl = glmmadmb(Case.Rank ~ Percent.L2 + (1|Stock) + (1+Percent.L2|Region), zeroInflation=TRUE,data=kasus, family="poisson",link="log") # the model with (1+Percent.L2|Stock) did not converge # however, random slopes make more sense for regions anyway.... source("/Users/teeniematlock/Desktop/statistics/scripts/overdispersion_function.r") get.overdisp(kasus.mdl) # significant overdispersion ... but the overdispersion parameter was not tooo bad # should be 1 (= equal ratio of mean and variance), but is 1.5 # to double check, let's consider a negative binomial model xmdl = glmmadmb(Case.Rank ~ Percent.L2 + (1|Stock) + (1+Percent.L2|Region), zeroInflation=TRUE,data=kasus, family="nbinom",link="log") xmdl.full = glmmadmb(Case.Rank ~ Percent.L2 + I(Percent.L2^2) + (1|Stock) + (1+Percent.L2|Region), zeroInflation=TRUE,data=kasus, family="nbinom",link="log") anova(xmdl,xmdl.full) # not much evidence for a quadratic component summary(xmdl) summary(xmdl.full) summary(xmdl) fixef(xmdl) # for comparison because of the fact that we dropped the one random slope component xmdl.pois = lmer(Case.Rank ~ Percent.L2 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus, family="poisson",link="log") xmdl.red.pois = lmer(Case.Rank ~ 1 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus, family="poisson",link="log") anova(xmdl.red.pois,xmdl.pois) ############################ ANALYSIS STEP 1: ############################ DOING THE SAME ANALYSIS WITH NO INDO-EUROPEAN SUBSET # does the result still hold if you exclude the languages without case? kasus.red = kasus[kasus$Case.Presence != 0,] # because xmdl = glmmadmb(Case.Rank ~ Percent.L2 + (1|Stock) + (1+Percent.L2|Region), data=kasus.red, family="nbinom",link="log") summary(xmdl) # it doesn't xmdl = lmer(Case.Rank ~ Percent.L2 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus.red) xmdl.red = lmer(Case.Rank ~ 1 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus.red) anova(xmdl.red,xmdl) summary(xmdl) # now, do the same thing without the Indo-European languages xmdl = glmmadmb(Case.Rank ~ Percent.L2 + (1|Stock) + (1+Percent.L2|Region), zeroInflation=TRUE,data=kasus[kasus$Stock!="Indo-European",], family="nbinom",link="log") summary(xmdl) # Case Presence without Indo-European languages xmdl = lmer(Case.Presence ~ Percent.L2 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus[kasus$Stock!="Indo-European",], family="binomial") xmdl.red = lmer(Case.Presence ~ 1 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus[kasus$Stock!="Indo-European",], family="binomial") anova(xmdl.red,xmdl) summary(xmdl) ############################ INFLUENCE DIAGNOSTICS red.est=c() for(i in 1:nrow(kasus)){ xmdl.red = lmer(Case.Presence ~ Percent.L2 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus[-i,], REML=FALSE,family="binomial") red.est = rbind(red.est, fixef(xmdl.red)) } # for comparison: intercept xmdl = 1.457, slope = -6.5728 hist(red.est[,1]) hist(red.est[,2]) # they are all negative... good(!) red.est=c() for(i in 1:nrow(kasus)){ # have to do poisson with lmer() coz glmmadmb() does not converge a lot of the time xmdl.red = lmer(Case.Rank ~ Percent.L2 + (1+Percent.L2|Stock) + (1+Percent.L2|Region), data=kasus[-i,], family="poisson",link="log") red.est = rbind(red.est, fixef(xmdl.red)) } # for comparison: intercept xmdl = 1.457, slope = -6.5728 hist(red.est[,1]) hist(red.est[,2]) # they are all negative... good(!) ############################ THE EFFECT OF POPULATION SIZE ## for comparison xmdl = lmer(Case.Presence ~ log(L1.Estimate+L2.Estimate) + (1+log(L1.Estimate+L2.Estimate)|Stock) + (1+log(L1.Estimate+L2.Estimate)|Region), data=kasus, family="binomial") xmdl.red = lmer(Case.Presence ~ 1 + (1+log(L1.Estimate+L2.Estimate)|Stock) + (1+log(L1.Estimate+L2.Estimate)|Region), data=kasus, family="binomial") anova(xmdl.red,xmdl) summary(xmdl) kasus$Log.Pop.Size = log(kasus$L1.Estimate+kasus$L2.Estimate) xmdl = glmmadmb(Case.Rank ~ Log.Pop.Size + (1+Log.Pop.Size|Stock) + (1+Log.Pop.Size|Region), zeroInflation=TRUE,data=kasus, family="nbinom",link="log") summary(xmdl) fixef(xmdl)