Some data in this lab comes from the OCS “Exploring CO2 emissions
across time” activity (https://www.opencasestudies.org/ocs-bp-co2-emissions/.
This dataset is available in the dasehr
package.
Additional data about climate change disasters can be found at “https://daseh.org/data/Yearly_CC_Disasters.csv”.
library(tidyverse)
library(dasehr)
Open the yearly_co2_emissions
dataset from the
dasehr
package and assign it to an object called
co2
. (You can also use read_csv()
from the
readr
package and download the dataset directly from the
daseh.org website: “https://daseh.org/data/Yearly_CO2_Emissions_1000_tonnes.csv”)
read_csv()
from the readr
package.read.csv()
.co2 <- yearly_co2_emissions
co2 <- read_csv("https://daseh.org/data/Yearly_CO2_Emissions_1000_tonnes.csv")
## Rows: 192 Columns: 265
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (264): 1751, 1752, 1753, 1754, 1755, 1756, 1757, 1758, 1759, 1760, 1761,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Look at the column names using colnames
- do you notice
any patterns?
colnames(co2)
## [1] "country" "1751" "1752" "1753" "1754" "1755" "1756"
## [8] "1757" "1758" "1759" "1760" "1761" "1762" "1763"
## [15] "1764" "1765" "1766" "1767" "1768" "1769" "1770"
## [22] "1771" "1772" "1773" "1774" "1775" "1776" "1777"
## [29] "1778" "1779" "1780" "1781" "1782" "1783" "1784"
## [36] "1785" "1786" "1787" "1788" "1789" "1790" "1791"
## [43] "1792" "1793" "1794" "1795" "1796" "1797" "1798"
## [50] "1799" "1800" "1801" "1802" "1803" "1804" "1805"
## [57] "1806" "1807" "1808" "1809" "1810" "1811" "1812"
## [64] "1813" "1814" "1815" "1816" "1817" "1818" "1819"
## [71] "1820" "1821" "1822" "1823" "1824" "1825" "1826"
## [78] "1827" "1828" "1829" "1830" "1831" "1832" "1833"
## [85] "1834" "1835" "1836" "1837" "1838" "1839" "1840"
## [92] "1841" "1842" "1843" "1844" "1845" "1846" "1847"
## [99] "1848" "1849" "1850" "1851" "1852" "1853" "1854"
## [106] "1855" "1856" "1857" "1858" "1859" "1860" "1861"
## [113] "1862" "1863" "1864" "1865" "1866" "1867" "1868"
## [120] "1869" "1870" "1871" "1872" "1873" "1874" "1875"
## [127] "1876" "1877" "1878" "1879" "1880" "1881" "1882"
## [134] "1883" "1884" "1885" "1886" "1887" "1888" "1889"
## [141] "1890" "1891" "1892" "1893" "1894" "1895" "1896"
## [148] "1897" "1898" "1899" "1900" "1901" "1902" "1903"
## [155] "1904" "1905" "1906" "1907" "1908" "1909" "1910"
## [162] "1911" "1912" "1913" "1914" "1915" "1916" "1917"
## [169] "1918" "1919" "1920" "1921" "1922" "1923" "1924"
## [176] "1925" "1926" "1927" "1928" "1929" "1930" "1931"
## [183] "1932" "1933" "1934" "1935" "1936" "1937" "1938"
## [190] "1939" "1940" "1941" "1942" "1943" "1944" "1945"
## [197] "1946" "1947" "1948" "1949" "1950" "1951" "1952"
## [204] "1953" "1954" "1955" "1956" "1957" "1958" "1959"
## [211] "1960" "1961" "1962" "1963" "1964" "1965" "1966"
## [218] "1967" "1968" "1969" "1970" "1971" "1972" "1973"
## [225] "1974" "1975" "1976" "1977" "1978" "1979" "1980"
## [232] "1981" "1982" "1983" "1984" "1985" "1986" "1987"
## [239] "1988" "1989" "1990" "1991" "1992" "1993" "1994"
## [246] "1995" "1996" "1997" "1998" "1999" "2000" "2001"
## [253] "2002" "2003" "2004" "2005" "2006" "2007" "2008"
## [260] "2009" "2010" "2011" "2012" "2013" "2014"
# Most column names are years
Let’s rename the column “2011” in “co2” to “CO2_2011” using
rename
. Repeat this for the years 2012, 2013, and 2014.
Make sure to reassign to co2
here and in subsequent
steps.
# General format
new_data <- old_data %>% rename(newname = oldname)
co2 <- co2 %>% rename(
CO2_2011 = `2011`,
CO2_2012 = `2012`,
CO2_2013 = `2013`,
CO2_2014 = `2014`
)
Select only the columns “country”, and those that start with “CO2_”.
Use select
and starts_with("CO2_")
.
# General format
new_data <- old_data %>% select(colname1, colname2, ...)
co2 <- co2 %>% select(country, starts_with("CO2_"))
Create a new dataset “co2_long” that does pivot_longer()
on all columns except “country”. Remember that !country
means all columns except “country”.
# General format
new_data <- old_data %>% pivot_longer(cols = colname(s))
co2_long <- co2 %>% pivot_longer(cols = !country)
Using co2_long
, filter the “country” column so it only
includes values from Indonesia and Canada. Hint: use
filter
and %in%
.
# General format
new_data <- old_data %>% filter(colname %in% c(...))
co2_long <- co2_long %>%
filter(country %in% c("Indonesia", "Canada"))
Use pivot_wider
to reshape “co2_long”. Use “county” for
the names_from
argument. Use “value” for the
values_from
argument. Call this new data
co2_wide
. Look at the data. How do these years compare to
one another?
# General format
new_data <- old_data %>% pivot_wider(names_from = column1, values_from = column2)
co2_wide <- co2_long %>%
pivot_wider(
names_from = country,
values_from = value
)
co2_wide
## # A tibble: 4 × 3
## name Canada Indonesia
## <chr> <dbl> <dbl>
## 1 CO2_2011 537000 604000
## 2 CO2_2012 517000 637000
## 3 CO2_2013 517000 490000
## 4 CO2_2014 537000 464000
Take the code from Questions 1.1 and 1.3-1.7. Chain all of this code
together using the pipe %>%
. Call your data
co2_compare
.
co2_compare <-
yearly_co2_emissions %>%
rename(
CO2_2011 = `2011`,
CO2_2012 = `2012`,
CO2_2013 = `2013`,
CO2_2014 = `2014`
) %>% select(country, starts_with("CO2_")) %>%
pivot_longer(cols = !country) %>%
filter(country %in% c("Indonesia", "Canada")) %>%
pivot_wider(names_from = country, values_from = value)
co2_compare
## # A tibble: 4 × 3
## name Canada Indonesia
## <chr> <dbl> <dbl>
## 1 CO2_2011 537000 604000
## 2 CO2_2012 517000 637000
## 3 CO2_2013 517000 490000
## 4 CO2_2014 537000 464000
Modify the code from Question P.1:
co2_compare2
co2_compare2 <-
yearly_co2_emissions %>%
rename(
CO2_1950 = `1950`,
CO2_1960 = `1960`,
CO2_1970 = `1970`,
CO2_1980 = `1980`
) %>% select(country, starts_with("CO2_")) %>%
pivot_longer(cols = !country) %>%
filter(country %in% c("Brazil", "Mexico")) %>%
pivot_wider(names_from = country, values_from = value)
co2_compare2
## # A tibble: 4 × 3
## name Brazil Mexico
## <chr> <dbl> <dbl>
## 1 CO2_1950 19700 30500
## 2 CO2_1960 46900 63100
## 3 CO2_1970 93800 114000
## 4 CO2_1980 187000 268000
Open the Yearly_CC_Disasters
dataset using the url
below. Save the dataset as an object called “cc”. rename the column
“Country” to “country”.
“https://daseh.org/data/Yearly_CC_Disasters.csv”
cc <- read_csv("https://daseh.org/data/Yearly_CC_Disasters.csv") %>%
rename(country = Country)
## Rows: 970 Columns: 53
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Country, ISO2, ISO3, Indicator, Unit, Source, CTS Code, CTS Name, ...
## dbl (44): ObjectId, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 19...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Join the datasets “co2” and “cc” by “country”. Keep all rows, even if there is no match. How many rows are there?
# General format
full_join(data1, data2, by = "")
full_join(co2, cc, by = "country")
## # A tibble: 1,032 × 57
## country CO2_2011 CO2_2012 CO2_2013 CO2_2014 ObjectId ISO2 ISO3 Indicator
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Afghanist… 12200 10800 10000 9810 NA <NA> <NA> <NA>
## 2 Albania 5240 4910 5060 5720 8 AL ALB Climate …
## 3 Albania 5240 4910 5060 5720 9 AL ALB Climate …
## 4 Albania 5240 4910 5060 5720 10 AL ALB Climate …
## 5 Albania 5240 4910 5060 5720 11 AL ALB Climate …
## 6 Albania 5240 4910 5060 5720 12 AL ALB Climate …
## 7 Albania 5240 4910 5060 5720 13 AL ALB Climate …
## 8 Albania 5240 4910 5060 5720 14 AL ALB Climate …
## 9 Algeria 121000 130000 134000 145000 15 DZ DZA Climate …
## 10 Algeria 121000 130000 134000 145000 16 DZ DZA Climate …
## # ℹ 1,022 more rows
## # ℹ 48 more variables: Unit <chr>, Source <chr>, `CTS Code` <chr>,
## # `CTS Name` <chr>, `CTS Full Descriptor` <chr>, `1980` <dbl>, `1981` <dbl>,
## # `1982` <dbl>, `1983` <dbl>, `1984` <dbl>, `1985` <dbl>, `1986` <dbl>,
## # `1987` <dbl>, `1988` <dbl>, `1989` <dbl>, `1990` <dbl>, `1991` <dbl>,
## # `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, `1996` <dbl>,
## # `1997` <dbl>, `1998` <dbl>, `1999` <dbl>, `2000` <dbl>, `2001` <dbl>, …
# 1,032 rows
Join the datasets “co2” and “cc” by “country”. Keep only rows that match both datasets. How many rows are there?
# General format
inner_join(data1, data2, by = "")
inner_join(co2, cc, by = "country")
## # A tibble: 628 × 57
## country CO2_2011 CO2_2012 CO2_2013 CO2_2014 ObjectId ISO2 ISO3 Indicator
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Albania 5240 4910 5060 5720 8 AL ALB Climate rel…
## 2 Albania 5240 4910 5060 5720 9 AL ALB Climate rel…
## 3 Albania 5240 4910 5060 5720 10 AL ALB Climate rel…
## 4 Albania 5240 4910 5060 5720 11 AL ALB Climate rel…
## 5 Albania 5240 4910 5060 5720 12 AL ALB Climate rel…
## 6 Albania 5240 4910 5060 5720 13 AL ALB Climate rel…
## 7 Albania 5240 4910 5060 5720 14 AL ALB Climate rel…
## 8 Algeria 121000 130000 134000 145000 15 DZ DZA Climate rel…
## 9 Algeria 121000 130000 134000 145000 16 DZ DZA Climate rel…
## 10 Algeria 121000 130000 134000 145000 17 DZ DZA Climate rel…
## # ℹ 618 more rows
## # ℹ 48 more variables: Unit <chr>, Source <chr>, `CTS Code` <chr>,
## # `CTS Name` <chr>, `CTS Full Descriptor` <chr>, `1980` <dbl>, `1981` <dbl>,
## # `1982` <dbl>, `1983` <dbl>, `1984` <dbl>, `1985` <dbl>, `1986` <dbl>,
## # `1987` <dbl>, `1988` <dbl>, `1989` <dbl>, `1990` <dbl>, `1991` <dbl>,
## # `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, `1996` <dbl>,
## # `1997` <dbl>, `1998` <dbl>, `1999` <dbl>, `2000` <dbl>, `2001` <dbl>, …
# 628 rows
What countries are present in “co2” that are not present in “cc”? Use
anti_join
. What countries are present in “cc” that are not
present in “co2”?
# General format
anti_join(data1, data2, by = "") %>% select(index)
anti_join(co2, cc, by = "country") %>% select(country)
## # A tibble: 62 × 1
## country
## <chr>
## 1 Afghanistan
## 2 Andorra
## 3 Armenia
## 4 Azerbaijan
## 5 Bahamas
## 6 Bahrain
## 7 Belarus
## 8 Brunei
## 9 Cape Verde
## 10 Central African Republic
## # ℹ 52 more rows
anti_join(cc, co2, by = "country") %>% select(country) %>% distinct()
## # A tibble: 85 × 1
## country
## <chr>
## 1 Afghanistan, Islamic Rep. of
## 2 American Samoa
## 3 Anguilla
## 4 Armenia, Rep. of
## 5 Azerbaijan, Rep. of
## 6 Azores Island
## 7 Bahamas, The
## 8 Belarus, Rep. of
## 9 Bermuda
## 10 British Virgin Islands
## # ℹ 75 more rows
Take the code from 2.2 and save the output as an object “co2_cc”. Filter the dataset. Filter so that you only keep Indonesia and Canada.
co2_cc <- full_join(co2, cc, by = "country") %>%
filter(country %in% c("Indonesia", "Canada"))
Select:
2014
)Rename:
2014
Reassign to “co2_cc”.
co2_cc <- co2_cc %>% select(country, CO2_2014, Indicator, `2014`) %>%
rename(emissions = CO2_2014, disasters = `2014`)
Use stringr
to trim the piece of text, “Climate related
disasters frequency, Number of Disasters:”, from the “Indicator” column.
You will use the function str_remove()
to do this. It works
similarly to other stringr
functions. Try to intuit how it
works by using the documentation page (?str_remove
).
library(stringr)
co2_cc <- co2_cc %>% mutate(Indicator = str_remove(
Indicator,
"Climate related disasters frequency, Number of Disasters: "
))
Pivot the dataset so that there are columns for country, emissions, and a column for each “Indicator”.
co2_cc %>% pivot_wider(
names_from = Indicator,
values_from = disasters
)
## # A tibble: 2 × 9
## country emissions Drought `Extreme temperature` Flood Storm TOTAL Wildfire
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Canada 537000 NA 1 1 NA 2 NA
## 2 Indonesia 464000 NA NA 6 NA 8 NA
## # ℹ 1 more variable: Landslide <dbl>