# Read in data #setwd("C:\\Documents and Settings\\hap7\\My Documents\\research\\ELPUB\\") setwd("Documents") setwd("ELPUB 2008") dat = read.csv("ELPUB 2008 Piwowar data.csv", header=TRUE, row.names=1) # set the NAs to be zero so that we can compute with the subdisciplines dat[is.na(dat)] = 0 dim(dat) rownames(dat) colnames(dat) median.IF = median(dat$Impact.Factor, na.rm=TRUE) # calculate some useful fields dat$percent.microarray = dat$Record.Count/dat$Articles # A quick summary of the data ### Impact factor table(dat$strength.tri) windows() quartz() boxplot(Impact.Factor ~ strength.tri, data = dat, boxwex = 0.5, names=c("No data sharing policy", "Weak policy", "Strong policy"), ylab = "Journal impact factor", outline=T, notch=F, log="y", horizontal=FALSE) print("Strength of Policy") print("Median impact factor") tapply(dat$Impact.Factor, dat$strength.tri, median) ### Publisher print("Strength of Policy") print("Association, Commercial") tt = table(dat$strength.tri, dat$Published.by.Association) tt # Distribution of Published.by.Association journals across policy strength categories round(tt[,2]/(tt[,1]+tt[,2]), 2) # Percent with no policy round(tt[1,]/(tt[1,]+tt[2,]+tt[3,]), 2) ### Open access print("Strength of Policy") print("Closed, Open") tt = table(dat$strength.tri, dat$open.access) tt # Distribution of open access journals across policy strength categories round(tt[,2]/(tt[,1]+tt[,2]), 2) # Percent with no policy round(tt[1,]/(tt[1,]+tt[2,]+tt[3,]), 2) #### Prevalence table(dat$strength.tri) windows() quartz() boxplot(conserv.GEO.percent ~ strength.tri, data = dat, boxwex = 0.5, names=c("No data sharing policy", "Weak policy", "Strong policy"), ylab = "Data sharing prevalence within journals", outline=T, notch=F, horizontal=FALSE) ## Breakdown by impact factor windows() quartz() boxplot(conserv.GEO.percent ~ strength.tri, data = dat, subset = dat$ln.impact.factor <= median(dat$ln.impact.factor), boxwex = 0.5, names=c("No data sharing policy", "Weak policy", "Strong policy"), ylab = "Data sharing prevalence within journals", outline=T, notch=F, horizontal=FALSE) windows() quartz() boxplot(conserv.GEO.percent ~ strength.tri, data = dat, subset = dat$ln.impact.factor > median(dat$ln.impact.factor), boxwex = 0.5, names=c("No data sharing policy", "Weak policy", "Strong policy"), ylab = "Data sharing prevalence within journals", outline=T, notch=F, horizontal=FALSE) ## Breakdown by association windows() quartz() boxplot(conserv.GEO.percent ~ strength.tri, data = dat, subset = dat$Published.by.Association <= 0, boxwex = 0.5, names=c("No data sharing policy", "Weak policy", "Strong policy"), ylab = "Data sharing prevalence within journals", outline=T, notch=F, horizontal=FALSE) windows() quartz() boxplot(conserv.GEO.percent ~ strength.tri, data = dat, subset = dat$Published.by.Association > 0, boxwex = 0.5, names=c("No data sharing policy", "Weak policy", "Strong policy"), ylab = "Data sharing prevalence within journals", outline=T, notch=F, horizontal=FALSE) print("Strength of Policy") print("Median data sharing prevalence") tapply(dat$conserv.GEO.percent, dat$strength.tri, median, na.rm=T) ### Multivariate analysis ## Primary analysis result.primary = lm(strength.bi ~ ln.impact.factor. + Published.by.Association + open.access + BIOCHEMISTRY...MOLECULAR.BIOLOGY + BIOTECHNOLOGY...APPLIED.MICROBIOLOGY + PLANT.SCIENCES + ONCOLOGY + CELL.BIOLOGY + GENETICS...HEREDITY + MULTIDISCIPLINARY.SCIENCES, dat) summary(result.primary) ## Prevalence result.primary = lm(conserv.GEO.percent ~ strength.bi + ln.impact.factor. + Published.by.Association + open.access + BIOCHEMISTRY...MOLECULAR.BIOLOGY + BIOTECHNOLOGY...APPLIED.MICROBIOLOGY + PLANT.SCIENCES + ONCOLOGY + CELL.BIOLOGY + GENETICS...HEREDITY + MULTIDISCIPLINARY.SCIENCES, dat) summary(result.primary)