|
############################################################################ |
|
# AUTHOR: John Horton |
|
# PURPOSE: Extract by-country minimum hourly wages from the Wikipedia page |
|
# LAST MODIFIED: May 22, 2013 |
|
############################################################################ |
|
|
|
library(XML) |
|
library(ggplot2) |
|
library(scales) |
|
|
|
# Fetch the Wikipedia page and parse every HTML table found on it.
url <- "http://en.wikipedia.org/wiki/List_of_minimum_wages_by_country"
raw <- readHTMLTable(url)

# The wage data is taken from the second parsed table. Its nine columns are
# given short, syntactic names so they can be referenced directly later on.
df.raw <- raw[[2]]
colnames(df.raw) <- c(
  "country",
  "minimum_wage",
  "annual",
  "annual_ppp",
  "workweek",
  "hourly_usd",
  "hourly_intl",
  "perc_2011_gdp",
  "effective"
)
|
|
|
# Clean.Wage: turn the raw text of an hourly-wage table cell into a number.
#
# The parsed cells contain HTML junk and idiosyncrasies that this strips out.
#
# Args:
#   x: raw cell text (character, or a factor that is coerced to character).
#
# Returns:
#   A numeric vector with one wage per element of `x`. Elements that are not
#   numeric after cleaning become NA (with the usual as.numeric() warning).
#   Because of Vectorize(), a plain character input yields a result named by
#   the original strings.
Clean.Wage <- Vectorize(function(x) {
  # Remove the "US$" currency marker. fixed = TRUE makes "$" a literal
  # character; as a regular expression it would be an end-of-string anchor
  # and the marker would never be removed.
  x1 <- gsub("US$", "", x, fixed = TRUE)
  # Keep everything from the 20th character on, dropping the span meta-data
  # that XML picks up.
  # NOTE(review): assumes that meta-data is exactly 19 characters long --
  # confirm against the current page markup.
  x2 <- substring(x1, first = 20)
  as.numeric(x2)
})
|
|
|
# Clean.Country: drop the first character of each country name.
#
# The parsed country cells carry one junk character (a "_", per the original
# author's note) in front of the name; everything from the second character
# onward is kept.
#
# Args:
#   x: raw country text (character, or a factor that is coerced to character).
#
# Returns:
#   A character vector with the first character of every element removed.
Clean.Country <- Vectorize(FUN = function(x) {
  substring(x, 2L)
})
|
|
|
# Analysis data frame: cleaned country names next to the numeric hourly
# minimum wage in USD.
df <- data.frame(
  country = Clean.Country(df.raw$country),
  min.wage = Clean.Wage(df.raw$hourly_usd)
)
|
|
|
# Make.MW.plot: point plot of hourly minimum wage per country.
#
# There are a large number of countries, so the idea is to split the data
# into subsets and call this once per subset.
#
# Args:
#   df:    data frame with columns `country` and `min.wage`.
#   label: text placed on the second line of the plot title.
#
# Returns:
#   A ggplot object.
Make.MW.plot <- function(df, label) {
  title <- paste0("Hourly minimum wages by country \n", label)

  # The source note is carried in the wage-axis label. The label is assembled
  # with an explicit "\n" rather than a string literal spanning several
  # source lines, so the rendered text does not depend on file layout.
  wage_axis_label <- paste0(
    "Hourly Wage (USD) \n \n Source: Wikipedia, May, 21, 2013\n",
    "en.wikipedia.org/wiki/List_of_minimum_wages_by_country"
  )

  # ggplot() + geom_point() is the non-deprecated equivalent of
  # qplot(country, min.wage, data = df).
  ggplot(df, aes(x = country, y = min.wage)) +
    geom_point() +
    ylab(wage_axis_label) +
    xlab("") +
    scale_y_continuous(labels = dollar) +
    coord_flip() +          # countries run down the vertical axis
    theme_bw() +
    expand_limits(y = 0) +  # always include $0 on the wage axis
    ggtitle(title)
}
|
|
|
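# Split countries into four wage bands; the cutoffs used below (0.50, 1.25, 2.50) are the quartiles from this summary, rounded to convenient values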
|
## > summary(df$min.wage) |
|
## Min. 1st Qu. Median Mean 3rd Qu. Max. |
|
## 0.030 0.490 1.180 2.308 2.460 16.450 |
|
|
|
# One plot per wage band, from the highest-paying band down to the lowest.
g.75 <- Make.MW.plot(
  df = subset(df, min.wage > 2.5),
  label = "> 2.50"
)
g.50 <- Make.MW.plot(
  df = subset(df, min.wage > 1.25 & min.wage <= 2.50),
  label = "1.25 < Minimum Wage <= 2.50"
)
g.25 <- Make.MW.plot(
  df = subset(df, min.wage > 0.50 & min.wage <= 1.25),
  label = "0.50 < Minimum Wage <= 1.25"
)
g.0 <- Make.MW.plot(
  df = subset(df, min.wage <= 0.50),
  label = "Minimum Wage <= 0.50"
)
|
|
|
# Write.Image: render a plot object to an image file.
#
# Args:
#   filename: path of the file to write. Its parent directory is created if
#             it does not exist yet.
#   g:        object to draw; it is rendered by calling print(g) while the
#             device is open (e.g. a ggplot).
#   width:    passed as the second argument to the device function.
#   height:   passed as the third argument to the device function.
#   format:   name of the graphics device function to call. The default
#             format is "png".
#
# Returns:
#   The value of dev.off() for the device that was opened.
Write.Image <- function(filename, g, width = 500, height = 500, format = "png") {
  # The device functions fail when the target directory is missing.
  dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE)

  do.call(format, list(filename, width, height))
  dev_id <- dev.cur()
  # If drawing fails, still close the device opened above so that it does not
  # leak and capture later plotting output.
  on.exit(if (dev_id %in% dev.list()) dev.off(dev_id), add = TRUE)

  print(g)
  dev.off(dev_id)
}
|
|
|
# Write each wage-band plot to its own PNG, using the default size and format.
Write.Image(filename = "./minimum_wage_plots/quartile_75.png", g = g.75)
Write.Image(filename = "./minimum_wage_plots/quartile_50.png", g = g.50)
Write.Image(filename = "./minimum_wage_plots/quartile_25.png", g = g.25)
Write.Image(filename = "./minimum_wage_plots/quartile_0.png", g = g.0)
|
|
|
# Bonus plot - kernel density estimate of the distribution of hourly minimums.
# ggplot() + geom_density() is the non-deprecated equivalent of
# qplot(min.wage, geom = "density", data = df). The axis label is assembled
# with an explicit "\n" rather than a string literal spanning several source
# lines, so the rendered text does not depend on file layout.
g.distro <- ggplot(df, aes(x = min.wage)) +
  geom_density() +
  scale_x_log10(labels = dollar) +
  xlab(paste0(
    "Hourly minimum wages in USD, log scale \n Source: Wikipedia, May, 21, 2013\n",
    "en.wikipedia.org/wiki/List_of_minimum_wages_by_country"
  )) +
  theme_bw()

Write.Image("./minimum_wage_plots/distr.png", g.distro)