first_custom <- function() {
theme_minimal() +
theme(
axis.line = element_line(color = "gray0"),
plot.background = element_rect(fill = "grey95"),
panel.grid.major = element_line(linewidth = 0.5, color = "gray80"),
panel.grid.minor = element_blank(),
#panel.grid.minor = element_line(color = "gray85"),
plot.margin = margin(15, 15, 15 ,15),
plot.title = element_text(face = "bold"),
plot.title.position = "plot",
plot.caption = element_text(size = 8, face = "italic"),
plot.caption.position = "plot",
legend.background = element_rect(),
legend.justification = (c(1,0)),
legend.text = element_text(size = 8),
legend.title = element_text(size = 8),
legend.key.size = unit(0.25, "cm")
)
}
Taken from the GitHub repo
The TidyTuesday Project is a weekly data project aimed at the R ecosystem. As this project was born out of the R4DS Online Learning Community and the R for Data Science textbook, an emphasis was placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem, but other packages are more than welcome.
Every week they post a raw dataset, a chart or article related to that dataset, and ask participants to explore the data. While the dataset will be “tamed”, it will not always be tidy! As such you might need to apply various R for Data Science techniques to wrangle the data into a true tidy format. For this lab, we won’t require you to do much cleaning. The goal of TidyTuesday is to apply your R skills, get feedback, explore other’s work, and connect with the greater #RStats community!
TidyTuesday is intended to be a safe and supportive environment.
The TidyTuesday Repo contains weekly datasets for the last four years. There is a lot of documentation on the repo, please check it out. If you’re looking for some inspiration, #TidyTuesday is a popular hashtag on Twitter and other platforms.
rainfall <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-07/rainfall.csv')
temperature <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-07/temperature.csv')
temperature <- temperature %>%
mutate(year = as.numeric(format(date, "%Y",)),
month = as.numeric(format(date, "%m")))
temperature$city_name = toupper(temperature$city_name)
rainfall$city_name = toupper(rainfall$city_name)
Rainfall and Temperature data of Australia https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-01-07/readme.md
library(viridis)
# Find the intersection of city names
common_cities <- intersect(temperature$city_name, rainfall$city_name)
# Filter both datasets to keep only the common cities
temperature <- temperature %>%
filter(city_name %in% common_cities)
rainfall <- rainfall %>%
filter(city_name %in% common_cities)
temperature <- temperature %>%
mutate(year = as.numeric(format(date, "%Y",)),
month = as.numeric(format(date, "%m")))
temperature$city_name = toupper(temperature$city_name)
rainfall$city_name = toupper(rainfall$city_name)
temp_recent_mean <- temperature %>%
filter(year >= 2000, year <= 2019) %>%
mutate(city_name = str_to_title(city_name)) %>%
filter(temp_type == "max") %>%
group_by(year, city_name) %>%
summarise(temperature = mean(temperature, na.rm = TRUE), .groups='drop')
ggplot(temp_recent_mean, aes(fill=city_name, x=year, y=temperature, color=city_name)) +
geom_line() +
facet_wrap(~city_name, nrow=3) +
scale_x_continuous(name="Year", breaks=seq(2000, 2019, 5)) +
scale_y_continuous(name="Max Temp in Celsius", breaks=seq(14, 30, 2)) +
ggtitle("Temperature Variation in Each City: 2000-2019") +
theme(legend.position="none")
rain_recent_mean <- rainfall %>%
filter(year >= 2000, year <= 2019) %>%
group_by(year, city_name) %>%
summarise(mean_rain = mean(rainfall, na.rm=TRUE), .groups='drop') %>%
arrange(desc(mean_rain))
ggplot(rain_recent_mean, aes(fill=city_name, x=year, y=mean_rain, color=city_name)) +
geom_line() +
geom_point() +
scale_x_continuous(name="Year", breaks=seq(2000, 2019, 5)) +
scale_y_continuous(name="Average Rain in mm", breaks=seq(1, 8, 1)) +
facet_wrap(~city_name) +
ggtitle("Average Rainfall in Cities: 2000-2019") +
theme(legend.position="none", axis.text.x=element_text(size=6))
## Tables
library(dplyr)
library(knitr)
# Find common cities
common_cities <- intersect(temperature$city_name, rainfall$city_name)
# Filter datasets to keep only common cities and specified years
temp_recent_common <- temperature %>%
filter(city_name %in% common_cities, year >= 2000, year <= 2019)
rain_recent_common <- rainfall %>%
filter(city_name %in% common_cities, year >= 2000, year <= 2019)
# Calculate temperature summary statistics
summary_stats_temp <- temp_recent_common %>%
group_by(city_name, year) %>%
summarize(
mean_temp = mean(temperature, na.rm = TRUE),
median_temp = median(temperature, na.rm = TRUE),
min_temp = min(temperature, na.rm = TRUE),
max_temp = max(temperature, na.rm = TRUE),
.groups = 'drop'
)
# Calculate rainfall summary statistics
summary_stats_rain <- rain_recent_common %>%
group_by(city_name, year) %>%
summarize(
mean_rain = mean(rainfall, na.rm = TRUE),
median_rain = median(rainfall, na.rm = TRUE),
percentile_75_rain = quantile(rainfall, 0.75, na.rm = TRUE),
total_rain = sum(rainfall, na.rm = TRUE),
.groups = 'drop'
)
# Output summary tables
kable(summary_stats_temp, "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
scroll_box(width = "100%", height = "500px")
| city_name | year | mean_temp | median_temp | min_temp | max_temp |
|---|---|---|---|---|---|
| BRISBANE | 2000 | 20.08620 | 20.80 | 2.5 | 37.4 |
| BRISBANE | 2001 | 20.45110 | 21.60 | 2.6 | 36.7 |
| BRISBANE | 2002 | 20.46534 | 21.10 | 1.6 | 36.0 |
| BRISBANE | 2003 | 20.17521 | 21.00 | 1.8 | 35.1 |
| BRISBANE | 2004 | 20.63497 | 21.85 | 2.5 | 40.2 |
| BRISBANE | 2005 | 20.86644 | 21.60 | 4.1 | 35.4 |
| BRISBANE | 2006 | 20.46356 | 21.70 | 3.0 | 32.2 |
| BRISBANE | 2007 | 20.63767 | 21.40 | -0.1 | 34.2 |
| BRISBANE | 2008 | 20.07910 | 20.90 | 2.6 | 39.8 |
| BRISBANE | 2009 | 20.90932 | 21.70 | 3.4 | 34.8 |
| BRISBANE | 2010 | 20.53000 | 21.15 | 5.0 | 32.1 |
| BRISBANE | 2011 | 20.06027 | 21.15 | 3.2 | 33.5 |
| BRISBANE | 2012 | 20.23844 | 21.00 | 3.7 | 35.9 |
| BRISBANE | 2013 | 20.69411 | 21.35 | 4.1 | 32.4 |
| BRISBANE | 2014 | 20.95918 | 21.80 | 2.6 | 34.3 |
| BRISBANE | 2015 | 20.65151 | 21.60 | 4.5 | 36.3 |
| BRISBANE | 2016 | 21.18470 | 21.65 | 4.6 | 36.3 |
| BRISBANE | 2017 | 21.23452 | 22.30 | 4.3 | 34.9 |
| BRISBANE | 2018 | 20.77205 | 21.70 | 3.3 | 38.1 |
| BRISBANE | 2019 | 23.43079 | 23.50 | 5.2 | 34.0 |
| CANBERRA | 2000 | 13.05683 | 13.10 | -7.9 | 35.1 |
| CANBERRA | 2001 | 13.33278 | 13.90 | -6.8 | 39.9 |
| CANBERRA | 2002 | 13.56096 | 13.35 | -7.0 | 36.9 |
| CANBERRA | 2003 | 13.52452 | 13.40 | -5.7 | 39.8 |
| CANBERRA | 2004 | 13.76325 | 13.20 | -8.3 | 38.7 |
| CANBERRA | 2005 | 13.88137 | 13.55 | -5.4 | 39.4 |
| CANBERRA | 2006 | 13.97466 | 14.05 | -7.4 | 39.9 |
| CANBERRA | 2007 | 14.31397 | 13.60 | -6.1 | 40.5 |
| CANBERRA | 2008 | 13.24727 | 13.00 | -6.7 | 35.8 |
| CANBERRA | 2009 | 14.18411 | 13.20 | -7.5 | 40.0 |
| CANBERRA | 2010 | 13.54262 | 13.70 | -6.5 | 39.1 |
| CANBERRA | 2011 | 13.30836 | 14.00 | -8.0 | 37.5 |
| CANBERRA | 2012 | 12.84221 | 13.20 | -6.8 | 35.1 |
| CANBERRA | 2013 | 13.97411 | 13.70 | -6.0 | 41.6 |
| CANBERRA | 2014 | 14.08233 | 13.70 | -7.6 | 40.2 |
| CANBERRA | 2015 | 13.74609 | 13.50 | -7.0 | 37.0 |
| CANBERRA | 2016 | 14.39098 | 14.00 | -5.5 | 39.3 |
| CANBERRA | 2017 | 14.02428 | 13.80 | -8.7 | 41.6 |
| CANBERRA | 2018 | 14.43000 | 13.70 | -7.4 | 40.6 |
| CANBERRA | 2019 | 18.18775 | 17.40 | -2.7 | 41.6 |
| MELBOURNE | 2000 | 15.94877 | 15.10 | 1.8 | 38.9 |
| MELBOURNE | 2001 | 15.65863 | 14.75 | 3.6 | 40.9 |
| MELBOURNE | 2002 | 15.66014 | 14.90 | 1.2 | 36.7 |
| MELBOURNE | 2003 | 15.54137 | 14.90 | 1.4 | 43.9 |
| MELBOURNE | 2004 | 15.34071 | 14.40 | 1.4 | 39.7 |
| MELBOURNE | 2005 | 16.05630 | 15.20 | 1.9 | 42.7 |
| MELBOURNE | 2006 | 15.43247 | 14.70 | 2.2 | 42.2 |
| MELBOURNE | 2007 | 16.53178 | 15.50 | 2.0 | 40.9 |
| MELBOURNE | 2008 | 15.65451 | 14.70 | 2.3 | 41.0 |
| MELBOURNE | 2009 | 16.19822 | 15.30 | 1.8 | 45.9 |
| MELBOURNE | 2010 | 15.93580 | 15.10 | 3.0 | 43.4 |
| MELBOURNE | 2011 | 15.82593 | 15.20 | 2.5 | 39.5 |
| MELBOURNE | 2012 | 15.79302 | 14.80 | 2.1 | 39.7 |
| MELBOURNE | 2013 | 16.27411 | 15.60 | 1.3 | 40.9 |
| MELBOURNE | 2014 | 16.47233 | 15.45 | 1.2 | 43.4 |
| MELBOURNE | 2015 | 15.91562 | 14.70 | 0.6 | 41.2 |
| MELBOURNE | 2016 | 16.17637 | 15.50 | 3.0 | 42.2 |
| MELBOURNE | 2017 | 16.26849 | 15.40 | 0.8 | 37.8 |
| MELBOURNE | 2018 | 16.22466 | 15.30 | 0.8 | 41.7 |
| MELBOURNE | 2019 | 18.98510 | 18.05 | 6.2 | 42.8 |
| PERTH | 2000 | 18.75533 | 18.50 | 0.4 | 41.1 |
| PERTH | 2001 | 18.07329 | 18.00 | 0.5 | 40.1 |
| PERTH | 2002 | 18.62493 | 18.40 | 1.0 | 43.2 |
| PERTH | 2003 | 18.81836 | 18.20 | 1.2 | 43.8 |
| PERTH | 2004 | 18.59385 | 18.20 | 0.1 | 42.1 |
| PERTH | 2005 | 17.92027 | 17.65 | 0.8 | 42.6 |
| PERTH | 2006 | 18.58904 | 18.50 | -1.3 | 40.9 |
| PERTH | 2007 | 18.41027 | 18.20 | 2.5 | 44.5 |
| PERTH | 2008 | 18.57542 | 18.50 | 1.3 | 43.4 |
| PERTH | 2009 | 18.89589 | 17.80 | 0.9 | 43.0 |
| PERTH | 2010 | 18.91205 | 18.50 | -1.0 | 43.2 |
| PERTH | 2011 | 19.79918 | 19.25 | 0.2 | 41.8 |
| PERTH | 2012 | 19.14686 | 18.85 | -0.7 | 42.2 |
| PERTH | 2013 | 19.34315 | 18.95 | -0.3 | 41.1 |
| PERTH | 2014 | 19.21041 | 18.70 | 1.0 | 43.5 |
| PERTH | 2015 | 19.43534 | 18.55 | 0.1 | 44.2 |
| PERTH | 2016 | 18.20505 | 17.70 | 0.9 | 43.0 |
| PERTH | 2017 | 18.95603 | 18.10 | 1.0 | 43.3 |
| PERTH | 2018 | 18.77174 | 18.20 | 2.5 | 40.9 |
| PERTH | 2019 | 21.34636 | 21.15 | 1.5 | 42.9 |
| SYDNEY | 2000 | 18.66585 | 19.00 | 5.6 | 38.6 |
| SYDNEY | 2001 | 18.92712 | 19.15 | 5.8 | 35.8 |
| SYDNEY | 2002 | 18.85995 | 18.90 | 4.3 | 38.5 |
| SYDNEY | 2003 | 18.61479 | 19.10 | 4.9 | 39.6 |
| SYDNEY | 2004 | 19.08142 | 19.40 | 5.9 | 39.9 |
| SYDNEY | 2005 | 19.09877 | 19.40 | 5.1 | 39.2 |
| SYDNEY | 2006 | 19.05603 | 19.30 | 5.8 | 45.0 |
| SYDNEY | 2007 | 19.09699 | 19.40 | 3.7 | 39.5 |
| SYDNEY | 2008 | 18.34891 | 18.70 | 5.3 | 35.5 |
| SYDNEY | 2009 | 19.18459 | 19.60 | 5.9 | 41.7 |
| SYDNEY | 2010 | 18.99712 | 19.00 | 4.3 | 42.1 |
| SYDNEY | 2011 | 18.66575 | 18.80 | 5.5 | 41.5 |
| SYDNEY | 2012 | 18.51421 | 18.90 | 5.5 | 34.2 |
| SYDNEY | 2013 | 19.37041 | 19.90 | 7.0 | 45.8 |
| SYDNEY | 2014 | 19.27912 | 19.55 | 5.5 | 36.5 |
| SYDNEY | 2015 | 19.03712 | 19.30 | 5.0 | 40.9 |
| SYDNEY | 2016 | 19.63251 | 19.85 | 5.4 | 39.2 |
| SYDNEY | 2017 | 19.44507 | 19.85 | 5.4 | 39.4 |
| SYDNEY | 2018 | 19.14033 | 19.50 | 4.6 | 43.4 |
| SYDNEY | 2019 | 22.03046 | 22.15 | 7.8 | 39.6 |
kable(summary_stats_rain, "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
scroll_box(width = "100%", height = "500px")
| city_name | year | mean_rain | median_rain | percentile_75_rain | total_rain |
|---|---|---|---|---|---|
| BRISBANE | 2000 | 1.9666667 | 0.0 | 0.600 | 743.4 |
| BRISBANE | 2001 | 2.9766162 | 0.0 | 0.400 | 2164.0 |
| BRISBANE | 2002 | 1.8958621 | 0.0 | 0.000 | 1374.5 |
| BRISBANE | 2003 | 2.3370474 | 0.0 | 0.200 | 1678.0 |
| BRISBANE | 2004 | 3.2303797 | 0.0 | 0.000 | 2296.8 |
| BRISBANE | 2005 | 2.0973050 | 0.0 | 0.200 | 1478.6 |
| BRISBANE | 2006 | 2.1753463 | 0.0 | 0.200 | 1570.6 |
| BRISBANE | 2007 | 1.9217391 | 0.0 | 0.400 | 1370.2 |
| BRISBANE | 2008 | 5.0038544 | 0.0 | 4.000 | 2336.8 |
| BRISBANE | 2009 | 4.6768018 | 0.0 | 2.200 | 2076.5 |
| BRISBANE | 2010 | 7.4293850 | 0.2 | 4.800 | 3261.5 |
| BRISBANE | 2011 | 4.7905312 | 0.0 | 3.600 | 2074.3 |
| BRISBANE | 2012 | 5.0582927 | 0.0 | 2.350 | 2073.9 |
| BRISBANE | 2013 | 4.8500000 | 0.0 | 2.200 | 2027.3 |
| BRISBANE | 2014 | 2.4940129 | 0.0 | 0.200 | 1541.3 |
| BRISBANE | 2015 | 5.1922727 | 0.0 | 0.400 | 2284.6 |
| BRISBANE | 2016 | 2.0925620 | 0.0 | 0.400 | 759.6 |
| BRISBANE | 2017 | 2.7191011 | 0.0 | 0.200 | 968.0 |
| BRISBANE | 2018 | 2.4689655 | 0.0 | 0.850 | 859.2 |
| BRISBANE | 2019 | 1.8033058 | 0.0 | 0.200 | 654.6 |
| CANBERRA | 2008 | 2.2940000 | 0.0 | 0.050 | 229.4 |
| CANBERRA | 2009 | 1.1819178 | 0.0 | 0.200 | 431.4 |
| CANBERRA | 2010 | 2.6290411 | 0.0 | 0.400 | 959.6 |
| CANBERRA | 2011 | 1.5870879 | 0.0 | 0.400 | 577.7 |
| CANBERRA | 2012 | 1.8950820 | 0.0 | 0.200 | 693.6 |
| CANBERRA | 2013 | 1.4624658 | 0.0 | 0.200 | 533.8 |
| CANBERRA | 2014 | 1.5663912 | 0.0 | 0.200 | 568.6 |
| CANBERRA | 2015 | 1.5113573 | 0.0 | 0.200 | 545.6 |
| CANBERRA | 2016 | 2.1535519 | 0.0 | 0.800 | 788.2 |
| CANBERRA | 2017 | 1.3388430 | 0.0 | 0.200 | 486.0 |
| CANBERRA | 2018 | 1.2967033 | 0.0 | 0.000 | 472.0 |
| CANBERRA | 2019 | 0.9824658 | 0.0 | 0.000 | 358.6 |
| MELBOURNE | 2000 | 1.6349398 | 0.0 | 0.600 | 542.8 |
| MELBOURNE | 2001 | 1.8050147 | 0.0 | 1.200 | 611.9 |
| MELBOURNE | 2002 | 1.2055249 | 0.0 | 0.775 | 436.4 |
| MELBOURNE | 2003 | 1.3473973 | 0.0 | 0.800 | 491.8 |
| MELBOURNE | 2004 | 1.6473829 | 0.0 | 0.900 | 598.0 |
| MELBOURNE | 2005 | 1.5087912 | 0.0 | 0.800 | 549.2 |
| MELBOURNE | 2006 | 1.0264264 | 0.0 | 0.400 | 341.8 |
| MELBOURNE | 2007 | 1.2233533 | 0.0 | 0.800 | 408.6 |
| MELBOURNE | 2008 | 1.1803279 | 0.0 | 0.600 | 432.0 |
| MELBOURNE | 2009 | 1.1369863 | 0.0 | 0.600 | 415.0 |
| MELBOURNE | 2010 | 1.9949580 | 0.0 | 1.000 | 712.2 |
| MELBOURNE | 2011 | 2.3900277 | 0.0 | 1.000 | 862.8 |
| MELBOURNE | 2012 | 1.6983516 | 0.0 | 1.600 | 618.2 |
| MELBOURNE | 2013 | 1.8327824 | 0.0 | 1.400 | 665.3 |
| MELBOURNE | 2014 | 1.3164835 | 0.0 | 0.800 | 479.2 |
| MELBOURNE | 2015 | 1.2922636 | 0.0 | 0.600 | 451.0 |
| MELBOURNE | 2016 | 1.9286932 | 0.0 | 1.450 | 678.9 |
| MELBOURNE | 2017 | 1.7706371 | 0.0 | 1.000 | 639.2 |
| MELBOURNE | 2018 | 1.4876033 | 0.0 | 0.600 | 540.0 |
| MELBOURNE | 2019 | 1.2196133 | 0.0 | 1.200 | 441.5 |
| PERTH | 2000 | 1.9900826 | 0.0 | 0.650 | 722.4 |
| PERTH | 2001 | 1.9284507 | 0.0 | 0.000 | 684.6 |
| PERTH | 2002 | 1.7083102 | 0.0 | 0.200 | 616.7 |
| PERTH | 2003 | 2.1250689 | 0.0 | 0.800 | 771.4 |
| PERTH | 2004 | 1.5684211 | 0.0 | 0.400 | 566.2 |
| PERTH | 2005 | 2.2229917 | 0.0 | 0.900 | 802.5 |
| PERTH | 2006 | 1.2200000 | 0.0 | 0.000 | 445.3 |
| PERTH | 2007 | 1.8549451 | 0.0 | 0.825 | 675.2 |
| PERTH | 2008 | 2.1204420 | 0.0 | 0.875 | 767.6 |
| PERTH | 2009 | 1.5241758 | 0.0 | 0.400 | 554.8 |
| PERTH | 2010 | 1.4082418 | 0.0 | 0.000 | 512.6 |
| PERTH | 2011 | 2.1866667 | 0.0 | 0.200 | 787.2 |
| PERTH | 2012 | 1.7879656 | 0.0 | 0.200 | 624.0 |
| PERTH | 2013 | 2.0713889 | 0.0 | 0.425 | 745.7 |
| PERTH | 2014 | 1.7834254 | 0.0 | 0.000 | 645.6 |
| PERTH | 2015 | 1.5380556 | 0.0 | 0.000 | 553.7 |
| PERTH | 2016 | 2.0519663 | 0.0 | 0.400 | 730.5 |
| PERTH | 2017 | 2.0490028 | 0.0 | 0.200 | 719.2 |
| PERTH | 2018 | 2.1263736 | 0.0 | 0.200 | 774.0 |
| PERTH | 2019 | 1.5082192 | 0.0 | 0.000 | 550.5 |
| SYDNEY | 2000 | 2.2448087 | 0.0 | 0.800 | 821.6 |
| SYDNEY | 2001 | 3.7227397 | 0.0 | 1.200 | 1358.8 |
| SYDNEY | 2002 | 2.3561644 | 0.0 | 0.400 | 860.0 |
| SYDNEY | 2003 | 3.2887671 | 0.0 | 1.200 | 1200.4 |
| SYDNEY | 2004 | 2.7191257 | 0.0 | 0.550 | 995.2 |
| SYDNEY | 2005 | 2.2356164 | 0.0 | 0.600 | 816.0 |
| SYDNEY | 2006 | 2.7232877 | 0.0 | 1.000 | 994.0 |
| SYDNEY | 2007 | 4.1073973 | 0.0 | 1.200 | 1499.2 |
| SYDNEY | 2008 | 2.9579235 | 0.0 | 1.800 | 1082.6 |
| SYDNEY | 2009 | 2.6197260 | 0.0 | 0.800 | 956.2 |
| SYDNEY | 2010 | 3.1878453 | 0.0 | 1.800 | 1154.0 |
| SYDNEY | 2011 | 3.7512329 | 0.0 | 2.000 | 1369.2 |
| SYDNEY | 2012 | 3.3158470 | 0.0 | 1.200 | 1213.6 |
| SYDNEY | 2013 | 3.6832877 | 0.0 | 0.600 | 1344.4 |
| SYDNEY | 2014 | 2.4853186 | 0.0 | 1.600 | 897.2 |
| SYDNEY | 2015 | 3.6663014 | 0.0 | 1.400 | 1338.2 |
| SYDNEY | 2016 | 3.7857923 | 0.0 | 0.750 | 1385.6 |
| SYDNEY | 2017 | 2.9630854 | 0.0 | 0.700 | 1075.6 |
| SYDNEY | 2018 | 2.7478022 | 0.0 | 0.800 | 1000.2 |
| SYDNEY | 2019 | 2.3336986 | 0.0 | 0.200 | 851.8 |
Insights from Datasets:
Upward Temperature Trend: There is a noticeable upward trend in temperatures, particularly in the later years of the dataset, with several cities showing higher average and median temperatures. This is consistent with the global trend of rising temperatures and could be indicative of climate change effects.
Seasonal Variability: The wide range between the minimum and maximum temperatures for each city within each year indicates strong seasonal variability. This is expected, but the range also shows the potential for extreme weather events, as seen with some very high maximum temperatures.
Rainfall Variability: The rainfall data exhibits considerable yearly variability within cities, with certain years showing peaks that could correspond to specific heavy rainfall events. This suggests that rainfall is not consistent year-to-year and is subject to larger climatic patterns or events.
Data Quality: Initially, the median rainfall values being zero indicated potential data issues or many zero-rainfall days. However, the 75th percentile values showed that significant rainfall did occur, indicating the data likely captured actual variations in rainfall.
Extreme Values: The dataset contains some extreme temperature values, both hot and cold, which could warrant further investigation to understand if they are outliers, errors, or true weather phenomena.
Consistency Across Years: While there is year-to-year variation, the temperature data generally show a consistent pattern over the two-decade span, with the recent years standing out for their higher values.
Timeseries is all about a repeating trend. Sometimes you don’t know that there is a recurring trend until you create the data as a timeseries. Here, we’ll go ahead and use ts() on the variables that, in your opinion, it would make sense to apply it to.
The frequency argument inside ts() dictates how many observations should be considered in each cycle of the recurring trend.
library(lubridate)
library(zoo)
rainfall$date <- as.Date(paste(rainfall$year, rainfall$month, rainfall$day, sep = "-"), "%Y-%m-%d")
temperature$date <- as.Date(temperature$date)
rainfall_filtered <- subset(rainfall, year >= 2000 & (year < 2019 | (year == 2019 & as.Date(paste(year, month, day, sep="-")) <= as.Date("2019-05-31"))))
temperature_filtered <- subset(temperature, year >= 2000 & (year < 2019 | (year == 2019 & date <= as.Date("2019-05-31"))))
rainfall_filtered %>%
summarise(across(everything(), ~sum(is.na(.))))
## # A tibble: 1 × 12
## station_code city_name year month day rainfall period quality lat long
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 3160 18974 3164 0 0
## # ℹ 2 more variables: station_name <int>, date <int>
temperature_filtered %>%
summarise(across(everything(), ~sum(is.na(.))))
## # A tibble: 1 × 7
## city_name date temperature temp_type site_name year month
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 34 0 0 0 0
numeric_columns <- sapply(rainfall_filtered, is.numeric)
rainfall_filtered[numeric_columns] <- lapply(rainfall_filtered[numeric_columns], function(x) {ifelse(is.na(x), 0, x)})
temperature_filtered <- na.omit(temperature_filtered)
yearly_rainfall_by_city <- aggregate(rainfall ~ city_name + year, data = rainfall_filtered, sum)
yearly_temperature_by_city <- aggregate(temperature ~ city_name + year, data = temperature_filtered, mean)
monthly_rainfall_by_city <- aggregate(rainfall ~ city_name + year + month, data = rainfall_filtered, sum)
monthly_temperature_by_city <- aggregate(temperature ~ city_name + year + month, data = temperature_filtered, mean)
rainfall_list <- split(yearly_rainfall_by_city, yearly_rainfall_by_city$city_name)
temperature_list <- split(yearly_temperature_by_city, yearly_temperature_by_city$city_name)
rainfall_ts_list <- lapply(rainfall_list, function(df) {
start_year <- ifelse(df$city_name[1] == "Canberra", 2008, min(df$year))
end_year <- max(df$year)
ts(df$rainfall, start=start_year, end=end_year, frequency=1)
})
temperature_ts_list <- lapply(temperature_list, function(df) {
ts(df$temperature, start=min(df$year), end=max(df$year), frequency=1)
})
monthly_rainfall_list <- split(monthly_rainfall_by_city, monthly_rainfall_by_city$city_name)
monthly_temperature_list <- split(monthly_temperature_by_city, monthly_temperature_by_city$city_name)
monthly_rainfall_ts_list <- lapply(monthly_rainfall_list, function(df) {
start_year <- ifelse(df$city_name[1] == "Canberra", 2008, min(df$year))
ts(df$rainfall, start = c(start_year, min(df$month[df$year == start_year])), frequency = 12)
})
monthly_temperature_ts_list <- lapply(monthly_temperature_list, function(df) {
ts(df$temperature, start = c(min(df$year), min(df$month)), frequency = 12)
})
# Function to convert a time series object to a data frame
convert_ts_to_df <- function(ts_obj, city_name, value_name) {
years <- as.integer(time(ts_obj))
values <- as.vector(ts_obj)
data.frame(city_name = city_name, year = years, value = values, stringsAsFactors = FALSE)
}
# Convert each time series in the list to a data frame and then combine them
rainfall_df <- do.call(rbind, lapply(names(rainfall_ts_list), function(city) {
convert_ts_to_df(rainfall_ts_list[[city]], city, "rainfall")
}))
temperature_df <- do.call(rbind, lapply(names(temperature_ts_list), function(city) {
convert_ts_to_df(temperature_ts_list[[city]], city, "temperature")
}))
library(stats)
library(tseries)
library(forecast)
library(tidyr)
adf_test_results_monthly_temperature <- lapply(monthly_temperature_ts_list, adf.test)
adf_test_results_monthly_temperature
## $BRISBANE
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -0.33177, Lag order = 6, p-value = 0.9889
## alternative hypothesis: stationary
##
##
## $CANBERRA
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -0.05211, Lag order = 6, p-value = 0.99
## alternative hypothesis: stationary
##
##
## $MELBOURNE
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -0.056618, Lag order = 6, p-value = 0.99
## alternative hypothesis: stationary
##
##
## $PERTH
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -0.067175, Lag order = 6, p-value = 0.99
## alternative hypothesis: stationary
##
##
## $SYDNEY
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -0.16611, Lag order = 6, p-value = 0.99
## alternative hypothesis: stationary
adf_test_results_monthly_rainfall <- lapply(monthly_rainfall_ts_list, adf.test)
adf_test_results_monthly_rainfall
## $BRISBANE
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -3.7515, Lag order = 6, p-value = 0.02218
## alternative hypothesis: stationary
##
##
## $CANBERRA
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -4.0377, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
##
##
## $MELBOURNE
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -6.1863, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
##
##
## $PERTH
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -1.6024, Lag order = 6, p-value = 0.743
## alternative hypothesis: stationary
##
##
## $SYDNEY
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -4.8967, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf_test_results_yearly_rainfall <- lapply(rainfall_ts_list, adf.test)
adf_test_results_yearly_rainfall
## $BRISBANE
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -0.65474, Lag order = 2, p-value = 0.9621
## alternative hypothesis: stationary
##
##
## $CANBERRA
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -1.2307, Lag order = 2, p-value = 0.8655
## alternative hypothesis: stationary
##
##
## $MELBOURNE
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -1.5964, Lag order = 2, p-value = 0.7261
## alternative hypothesis: stationary
##
##
## $PERTH
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -3.7968, Lag order = 2, p-value = 0.03594
## alternative hypothesis: stationary
##
##
## $SYDNEY
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -1.4466, Lag order = 2, p-value = 0.7832
## alternative hypothesis: stationary
adf_test_results_yearly_temperature <- lapply(temperature_ts_list, adf.test)
adf_test_results_yearly_temperature
## $BRISBANE
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = 0.10806, Lag order = 2, p-value = 0.99
## alternative hypothesis: stationary
##
##
## $CANBERRA
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = 0.29541, Lag order = 2, p-value = 0.99
## alternative hypothesis: stationary
##
##
## $MELBOURNE
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -1.3562, Lag order = 2, p-value = 0.8177
## alternative hypothesis: stationary
##
##
## $PERTH
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = -2.9611, Lag order = 2, p-value = 0.2063
## alternative hypothesis: stationary
##
##
## $SYDNEY
##
## Augmented Dickey-Fuller Test
##
## data: X[[i]]
## Dickey-Fuller = 0.14916, Lag order = 2, p-value = 0.99
## alternative hypothesis: stationary
Which relationships are evident? Are they a long-term trend or do they just exhibit seasonality?
Monthly Temperature Dataset (First Set of Results): Most cities (like Brisbane, Canberra, Kent, Melbourne) have high p-values (close to 0.99), suggesting non-stationarity, indicating the presence of trends in the monthly temperature data.
Monthly Rainfall Dataset (Second Set of Results): Some cities (like Brisbane, Canberra, Melbourne, Sydney) show low p-values (e.g., < 0.05), indicating stationarity and thus no significant trend. Other cities show high p-values, suggesting non-stationarity and potential trends.
Yearly Rainfall Dataset (Third Set of Results): Cities like Perth show low p-values, indicating stationarity and no significant long-term trend.Other cities have higher p-values, suggesting potential trends.
Yearly Temperature Dataset (Fourth Set of Results): Cities like Brisbane and Canberra show high p-values, indicating non-stationarity and possible trends. Other cities, like Perth, show lower p-values, suggesting no significant long-term trend.
common_cities <- intersect(names(temperature_ts_list), names(rainfall_ts_list))
# Rainfall plot
rainfall_df_common <- do.call(rbind, lapply(common_cities, function(city) {
convert_ts_to_df(rainfall_ts_list[[city]], city, "rainfall")
}))
ggplot(rainfall_df_common, aes(x = year, y = value, group = city_name, color = city_name)) +
geom_line(lwd=1.1) +
facet_wrap(~ city_name, scales = "free_x") +
labs(title = "Yearly Rainfall by City (2000-2019)", x = "Year", y = "Total Rainfall") +
first_custom() +
theme(legend.position = "none")
# Temperature plot
temperature_df_common <- do.call(rbind, lapply(common_cities, function(city) {
convert_ts_to_df(temperature_ts_list[[city]], city, "temperature")
}))
ggplot(temperature_df_common, aes(x = year, y = value, group = city_name, color = city_name)) +
geom_line(lwd=1.1) +
facet_wrap(~ city_name, scales = "free_x") +
labs(title = "Yearly Mean Temperature by City (2000-2019)", x = "Year", y = "Mean Temperature") +
first_custom() +
theme(legend.position = "none")
The plots of temperature and rainfall trends by city from 2000 to 2019 show:
Seasonal patterns are evident in both temperature and rainfall data. There’s a slight upward trend in temperatures for most cities, suggesting a warming pattern. Rainfall data is more variable with no clear long-term trend.
Predicting 100 years into the future is complex, but if current warming trends persist, temperatures are likely to continue rising. Rainfall predictions are less certain due to their variability.
Geospatial data can get very complicated very quickly with
There are powerful applications like ArcGIS that specialize in processing this data. But we will be just fine with ggplot2. This week’s TidyTuesday project happens to involve geospatial data!
map_data()
geom_sf()group =coord_quickmap()library(sf)
library(raster)
library(dplyr)
library(ggrepel)
library(viridis)
library(ggplot2)
library(rnaturalearth)
library(rnaturalearthhires)
# Load Australia shapefile
australia <- rnaturalearth::ne_states(geounit = "australia")
sp::plot(australia)
# Load and plot raster data
grid_raster <- raster("2019120120191231.grid")
plot(grid_raster)
climate <- as(grid_raster, "SpatialPixelsDataFrame")
climate <- as.data.frame(climate)
climate <- climate %>%
rename(temp = X2019120120191231)
# Select and clean Australia cities data
australia_cities <- rainfall %>%
dplyr::select(city_name, long, lat) %>%
distinct()
australia_cities <- australia_cities[-4,]
# Create the plot
tempdec <- ggplot() +
geom_tile(data = climate, aes(x = x, y = y, fill = temp)) +
geom_sf(data = australia, fill = NA, color = "#3E3E3E", size = 0.1) +
geom_point(data = australia_cities, aes(x = long, y = lat), color = "white") +
geom_text_repel(data = australia_cities, aes(x = long, y = lat, label = city_name), size = 3, color = "white") +
scale_fill_viridis_c(option = "B") +
coord_sf(crs = st_crs(australia)) +
theme_void() +
theme(
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_blank(),
axis.text = element_blank(),
axis.title = element_blank(),
plot.caption = element_text(vjust = 1.1, hjust = 1.3, size = 7),
legend.position = "right",
legend.justification = "center",
legend.box.just = "right"
) +
labs(
title = "Australia Climate",
subtitle = "Average High Temperatures for December 2019",
caption = "Australian Bureau of Meteorology"
)
# Plot the result
print(tempdec)
The plot is a thematic map depicting the average high temperatures across Australia for December 2019. It employs a gradient color scheme, where warmer colors represent higher temperatures, creating a visual representation of the heat distribution. Major cities are marked with white points, and labels provide clear identification, adding a spatial context to the temperature data.
Links