# Use R and WDI to calculate the number of missing women

##################################################################################

# Code for using World Development Indicators to understand missing women problem
# http://data.worldbank.org/data-catalog/world-development-indicators

# You have to set your directory below with setwd()

##################################################################################

# Preliminaries
# Start from an empty workspace: remove every visible (non-dot-prefixed)
# object from the current environment before the analysis begins.
rm(list = ls(all.names = FALSE))

# This set of packages can be used to conduct the analysis
library(gmodels)
library(psych)
library(gdata)
library(pastecs)
library(doBy)
library(WDI)
library(ggplot2)
library(countrycode)
library(memisc)

# Set working directory.
# setwd() needs a path: calling it with no argument is an error and stops the
# script. Replace the placeholder below with your own folder and uncomment it.
# setwd("path/to/your/working/directory")

# Check the directory with
getwd()

# Create a character vector of the WDI indicator codes to import.
# The codes must be written with plain ASCII double quotes; typographic
# ("curly") quotes are not valid R string delimiters.
wdilist <- c(
  "NY.GDP.PCAP.PP.KD", # GDP per capita, PPP (constant intl $; base year
                       # depends on the WDI release)
  "NV.AGR.TOTL.ZS",    # Agriculture, value added (% of GDP)
  "NV.IND.TOTL.ZS",    # Industry, value added (% of GDP)
  "SP.POP.GROW",       # Population growth (annual %)
  "SP.POP.TOTL",       # Population, total
  "SP.POP.TOTL.FE.ZS", # Population, female (% of total)
  "SP.URB.TOTL.IN.ZS", # Urban population (% of total)
  "SP.POP.BRTH.MF",    # Sex ratio at birth (male births per female births)
  "SP.DYN.LE00.IN",    # Life expectancy at birth, total (years)
  "SP.DYN.LE00.FE.IN", # Life expectancy at birth, female (years)
  "SP.DYN.LE00.MA.IN", # Life expectancy at birth, male (years)
  "SP.DYN.SMAM.MA",    # Age at first marriage, male
  "SP.DYN.SMAM.FE",    # Age at first marriage, female
  "SP.DYN.IMRT.IN",    # Infant mortality rate
  "SP.DYN.TFRT.IN"     # Fertility rate, total (births per woman)
)

# Download the requested indicators for every country and aggregate from the
# World Bank, from 1960 through the current calendar year.
# extra = TRUE asks WDI() for additional country metadata; the `region`
# column used further down comes from it.
# This needs an internet connection and can take several minutes.
wdim <- WDI(
  country = "all",
  indicator = wdilist,
  extra = TRUE,
  start = 1960,
  end = as.numeric(format(Sys.Date(), "%Y"))
)

# Rename the variables: map each WDI indicator code (left) to the short
# column name used in the rest of the script (right).
# The short names are kept exactly as before (including the spelling
# "lifexpfem") so that any code relying on them keeps working.
wdi_names <- c(
  "NY.GDP.PCAP.PP.KD" = "GDPpcUSDreal",
  "SP.POP.TOTL"       = "population",
  "SP.POP.TOTL.FE.ZS" = "femaleperc",
  "SP.URB.TOTL.IN.ZS" = "urbanperc",
  "SP.POP.BRTH.MF"    = "sexratiobirth",
  "SP.DYN.LE00.IN"    = "lifeexp",
  "SP.POP.GROW"       = "popgrow",
  "SP.DYN.LE00.FE.IN" = "lifexpfem",
  "SP.DYN.LE00.MA.IN" = "lifeexpmale",
  "SP.DYN.SMAM.MA"    = "smammale",
  "SP.DYN.SMAM.FE"    = "smamfemale",
  "NV.AGR.TOTL.ZS"    = "gdpagshare",
  "NV.IND.TOTL.ZS"    = "gdpindshare",
  "SP.DYN.IMRT.IN"    = "infmort",
  "SP.DYN.TFRT.IN"    = "fertility"
)
wdim <- rename.vars(wdim, from = names(wdi_names), to = unname(wdi_names))

# Rescale population to millions of people, and add a column holding the
# natural log of GDP per capita
wdim[["population"]] <- wdim[["population"]] / 1e6
wdim[["logGDPpc"]] <- log(wdim[["GDPpcUSDreal"]])

# Show the column names now present in the data frame
colnames(wdim)

# Put the columns of data frame wdim on the search path.
# NOTE(review): attach() copies wdim as it is at this point; the filtering
# just below and any columns added later are NOT reflected in the attached
# copy. Prefer passing data explicitly (data = ..., with(), wdim$col). The
# call is kept only because the script calls detach(wdim) at its end.
attach(wdim)

# Take out the entries that are aggregates (eg East Asia) and not countries.
# subset() also drops rows where the condition is NA (i.e. region missing).
wdim <- subset(wdim, region != "Aggregates")
table(wdim$region)

# Regional summary (mean, minimum, maximum) of the percent of the population
# that is female, shown for 1985 and then for 2010
summaryBy(
  femaleperc ~ region,
  data = wdim[which(wdim$year == 1985), , drop = FALSE],
  FUN = c(mean, min, max),
  na.rm = TRUE
)
summaryBy(
  femaleperc ~ region,
  data = wdim[which(wdim$year == 2010), , drop = FALSE],
  FUN = c(mean, min, max),
  na.rm = TRUE
)

# Missing women per country-year.
# Benchmark: a "normal" female share of the population of 0.504 (50.4%).
#   expected women = (0.504 / (1 - 0.504)) * actual men
#   missing women  = expected women - actual women
# femaleperc is a percentage, hence the 0.01 factors; results are in the
# same units as `population`.
wdim$fracti <- 0.504 / (1 - 0.504)
wdim$actwomen <- with(wdim, 0.01 * femaleperc * population)
wdim$missingw <- with(
  wdim,
  fracti * (0.01 * (100 - femaleperc) * population) - actwomen
)

# Add up the missing women across all countries, by region, for three years.
# The year filter is applied once, through `data`; the formula's left-hand
# side is the plain column. Indexing it again with [year == ...] was
# redundant and could pick up `year` from the attached, unfiltered copy of
# wdim (different length) instead of from the filtered data.
summaryBy(missingw ~ region, data = subset(wdim, year == 1960),
          FUN = sum, na.rm = TRUE)
summaryBy(missingw ~ region, data = subset(wdim, year == 1985),
          FUN = sum, na.rm = TRUE)
summaryBy(missingw ~ region, data = subset(wdim, year == 2010),
          FUN = sum, na.rm = TRUE)

# Scatter plot, for 2010, of GDP per capita (x) against the percent of the
# population that is female (y), with fixed axis limits
with(
  subset(wdim, year == 2010),
  plot(
    femaleperc ~ GDPpcUSDreal,
    xlab = "GDP per capita", ylab = "% female",
    ylim = c(46, 55), xlim = c(100, 50000), col = "blue",
    main = "GDP per capita and % of population that is female"
  )
)
# Overlay the least-squares line fitted to the same 2010 observations
with(
  subset(wdim, year == 2010),
  abline(lm(femaleperc ~ GDPpcUSDreal), col = "red")
)

# Fit four linear models of percent female on country characteristics,
# each estimated on the year-2000 observations only
fit1 <- with(
  wdim[which(wdim$year == 2000), , drop = FALSE],
  lm(femaleperc ~ logGDPpc + population)
)
fit2 <- with(
  wdim[which(wdim$year == 2000), , drop = FALSE],
  lm(femaleperc ~ population + infmort)
)
fit3 <- with(
  wdim[which(wdim$year == 2000), , drop = FALSE],
  lm(femaleperc ~ population + infmort + fertility)
)
fit4 <- with(
  wdim[which(wdim$year == 2000), , drop = FALSE],
  lm(femaleperc ~ logGDPpc + fertility + region)
)

# Use mtable to collect the four fitted models side by side, reporting
# R-squared and the number of observations for each.
fitx <- mtable(
  "Model 1" = fit1, "Model 2" = fit2, "Model 3" = fit3, "Model 4" = fit4,
  summary.stats = c("R-squared", "N")
)

# Replace coefficient names with readable labels
fitx <- relabel(
  fitx,
  "(Intercept)" = "Constant",
  logGDPpc = "real log GDP per capita",
  population = "Total population",
  infmort = "Infant mortality rate",
  fertility = "Fertility rate"
)

# Write the table to a text file in the working directory, then display it;
# the file's contents can be pasted into Word or Excel.
file123 <- "mtable123.txt"
write.mtable(fitx, file = file123)
file.show(file123)

# Remove the copy of wdim that attach() placed on the search path earlier
detach(wdim)

# About mkevane
#
# Economist at Santa Clara University and Director of Friends of African Village Libraries.
# This entry was posted in Politics. Bookmark the permalink.