library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)


getwd()
## [1] "/Users/tyan"
df <- read_delim("Downloads/ARTH_data.xlsx - Sheet1.csv")
## New names:
## Rows: 336 Columns: 24
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (22): Seattle, ART SITE, ARTIST, ART TITLE, ART TYPE, MATERIALS, MEASURE... dbl
## (2): LATITUDE, LONGITUDE
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...24`
dim(df)
## [1] 336  24
df
## # A tibble: 336 × 24
##    Seattle `ART SITE`       ARTIST `ART TITLE` `ART TYPE` MATERIALS MEASUREMENTS
##    <chr>   <chr>            <chr>  <chr>       <chr>      <chr>     <chr>       
##  1 Seattle Adams Elementar… Hamme… "\"Guardia… Painting   Gouache … 22 1/8 in x…
##  2 Seattle Adams Elementar… MacKa… "\"Sunflow… Photograph Silver d… 20 in x 24 …
##  3 Seattle Adams Elementar… MacKa… "\"Lisiant… Photograph Silver d… 20 in x 24 …
##  4 Seattle Arbor Heights E… Carl … "\"Puget S… Sculpture  Glass mo… 8 ft 4 in x…
##  5 Seattle Ballard High Sc… Katz,… "\"Safety … Sculpture  Painted … 34 in x 12 …
##  6 Seattle Ballard High Sc… Knigh… "\"Diva\""  Print      Serigrap… 28 in x 23 …
##  7 Seattle Ballard High Sc… Lawre… "\"Windows… Print      Offset l… 21 3/4 in x…
##  8 Seattle Ballard High Sc… Moila… "\"Olympic… Painting   Oil pain… 31 in 6 in …
##  9 Seattle Ballard High Sc… Moila… "\"Gillnet… Painting   Oil pain… 3 ft x 6 ft 
## 10 Seattle Bryn Mawr Eleme… Lawre… "\"Builder… Print      Offset l… 30 in x 21 …
## # ℹ 326 more rows
## # ℹ 17 more variables: `DATE MADE` <chr>, LOCATION <chr>, ADDRESS <chr>,
## #   `ZIP CODE` <chr>, `SEE MORE AT` <chr>, LATITUDE <dbl>, LONGITUDE <dbl>,
## #   GEOCOORDINATES <chr>, `ID NUMBER` <chr>, `CREDIT LINE` <chr>,
## #   AGE_RANGE <chr>, `ASSOCIATED GROUP/COMMUNITY` <chr>, `SITE TYPE` <chr>,
## #   `THEMES/SUBJECT` <chr>, `ADDITIONAL NOTES ON CONTENT` <chr>,
## #   `IS ARTIST LOCAL?` <chr>, ...24 <chr>
df_clean <- df %>%
  select(SiteType = `SITE TYPE`, Theme = `THEMES/SUBJECT`, IsLocal = `IS ARTIST LOCAL?`) %>%
  drop_na() %>%
  mutate(Theme = str_trim(Theme)) %>%
  separate_rows(Theme, sep = ",\\s*")  # separate on comma + optional space

# Count themes per site type
theme_counts <- df_clean %>%
  group_by(SiteType, Theme) %>%
  tally(name = "Count")

# Get most common theme per site type
most_common_themes <- theme_counts %>%
  group_by(SiteType) %>%
  filter(Count == max(Count))

# Bar graph: Site type on x-axis, fill by Theme
ggplot(most_common_themes, aes(x = reorder(SiteType, -Count), y = Count, fill = Theme)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Most Common Theme by Site Type",
       x = "Site Type",
       y = "Number of Artworks",
       fill = "Theme") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Clean and prepare data
df_local <- df %>%
  select(SiteType = `SITE TYPE`, Theme = `THEMES/SUBJECT`, IsLocal = `IS ARTIST LOCAL?`) %>%
  drop_na() %>%
  mutate(Theme = str_trim(Theme)) %>%
  separate_rows(Theme, sep = ",\\s*")  # separate on comma + optional space

# Count themes by local/non-local
theme_by_local <- df_local %>%
  group_by(IsLocal, Theme) %>%
  tally(name = "Count")

# Bar graph: Theme on x-axis, fill by Local/Non-local
ggplot(theme_by_local, aes(x = reorder(Theme, -Count), y = Count, fill = IsLocal)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Themes by Artist Locality",
       x = "Theme",
       y = "Number of Artworks",
       fill = "Is Local?") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

df_sep <- df %>%
  select(IsLocal = `IS ARTIST LOCAL?`, Theme = `THEMES/SUBJECT`) %>%
  drop_na() %>%
  mutate(Theme = str_trim(Theme)) %>%
  separate_rows(Theme, sep = ",\\s*")

# Convert to uppercase for consistent matching
df_sep <- df_sep %>%
  mutate(Theme = toupper(Theme))

# Count total artists in each group (local/non-local)
total_by_locality <- df_sep %>%
  distinct(IsLocal, row_number()) %>%  # approximate unique entries
  count(IsLocal, name = "TotalArtists")

# Count artists who did PNW-themed work
pnw_by_locality <- df_sep %>%
  filter(str_detect(Theme, "PNW")) %>%
  count(IsLocal, name = "PNW_Artists")

# Join and calculate percentage
pnw_percent <- left_join(pnw_by_locality, total_by_locality, by = "IsLocal") %>%
  mutate(PercentPNW = round(100 * PNW_Artists / TotalArtists, 1))

print(pnw_percent)
## # A tibble: 2 × 4
##   IsLocal PNW_Artists TotalArtists PercentPNW
##   <chr>         <int>        <int>      <dbl>
## 1 No               12          352        3.4
## 2 Yes              34          358        9.5