# Read the raw Netflix titles data set directly from the GitHub repository.
netflixraw <- read_csv(
  file = "https://github.com/ygterl/EDA-Netflix-2020-in-R/raw/master/netflix_titles.csv"
)
## Rows: 6234 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): type, title, director, cast, country, date_added, rating, duration...
## dbl (2): show_id, release_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Then we can review the data with the following functions: `glimpse()` and `summary()`.
library(dplyr)
# Compact overview of the raw data: one line per column with its type and
# the first few values.
netflixraw %>%
  glimpse()
## Rows: 6,234
## Columns: 12
## $ show_id <dbl> 81145628, 80117401, 70234439, 80058654, 80125979, 8016389…
## $ type <chr> "Movie", "Movie", "TV Show", "TV Show", "Movie", "TV Show…
## $ title <chr> "Norm of the North: King Sized Adventure", "Jandino: What…
## $ director <chr> "Richard Finn, Tim Maltby", NA, NA, NA, "Fernando Lebrija…
## $ cast <chr> "Alan Marriott, Andrew Toth, Brian Dobson, Cole Howard, J…
## $ country <chr> "United States, India, South Korea, China", "United Kingd…
## $ date_added <chr> "September 9, 2019", "September 9, 2016", "September 8, 2…
## $ release_year <dbl> 2019, 2016, 2013, 2016, 2017, 2016, 2014, 2017, 2017, 201…
## $ rating <chr> "TV-PG", "TV-MA", "TV-Y7-FV", "TV-Y7", "TV-14", "TV-MA", …
## $ duration <chr> "90 min", "94 min", "1 Season", "1 Season", "99 min", "1 …
## $ listed_in <chr> "Children & Family Movies, Comedies", "Stand-Up Comedy", …
## $ description <chr> "Before planning an awesome wedding for his grandfather, …
# Per-column summary of the raw data (quartiles for numeric columns,
# length/class/mode for character columns).
summary(object = netflixraw)
## show_id type title director
## Min. : 247747 Length:6234 Length:6234 Length:6234
## 1st Qu.:80035802 Class :character Class :character Class :character
## Median :80163367 Mode :character Mode :character Mode :character
## Mean :76703679
## 3rd Qu.:80244889
## Max. :81235729
## cast country date_added release_year
## Length:6234 Length:6234 Length:6234 Min. :1925
## Class :character Class :character Class :character 1st Qu.:2013
## Mode :character Mode :character Mode :character Median :2016
## Mean :2013
## 3rd Qu.:2018
## Max. :2020
## rating duration listed_in description
## Length:6234 Length:6234 Length:6234 Length:6234
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
# Build the working data set from the raw data:
#   1. keep only the columns used below (each column listed once; the
#      original listed `rating` twice, which select() silently collapsed),
#   2. drop rows that repeat the same title/country/type/release_year
#      combination, keeping the first occurrence with all of its columns.
netflix <- netflixraw %>%
  select(
    show_id, type, title, director, cast, rating,
    country, release_year, duration, listed_in
  ) %>%
  distinct(title, country, type, release_year, .keep_all = TRUE)
summary(netflix)
## show_id type title director
## Min. : 247747 Length:6232 Length:6232 Length:6232
## 1st Qu.:80035688 Class :character Class :character Class :character
## Median :80163360 Mode :character Mode :character Mode :character
## Mean :76702412
## 3rd Qu.:80244866
## Max. :81235729
## cast rating country release_year
## Length:6232 Length:6232 Length:6232 Min. :1925
## Class :character Class :character Class :character 1st Qu.:2013
## Mode :character Mode :character Mode :character Median :2016
## Mean :2013
## 3rd Qu.:2018
## Max. :2020
## duration listed_in
## Length:6232 Length:6232
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Here we split the comma-separated values and treat each value as a separate row.
# One row per (title, country): `country` holds a ", "-separated list, so
# split it into a list column and unnest it into separate rows.
netflixallcountry <- netflix %>%
  mutate(country = strsplit(as.character(country), ", ")) %>%
  unnest(country)

# Number of titles per country.
netflixallcountry1 <- netflixallcountry %>%
  group_by(country) %>%
  summarise(country_number = n())

# Countries ordered from most to fewest titles.
# The original passed `decreasing = TRUE` to `[` instead of to order(), so
# the descending sort was never applied; arrange(desc()) does what was meant.
netflixallcountry2 <- netflixallcountry1 %>%
  arrange(desc(country_number))
# One row per (title, category): `listed_in` holds a ", "-separated list of
# categories, so split it into a list column and unnest it.
netflixalllisted_in <- netflix %>%
  mutate(listed_in = strsplit(as.character(listed_in), ", ")) %>%
  unnest(listed_in)

# Number of titles per category.
netflixalllisted_in1 <- netflixalllisted_in %>%
  group_by(listed_in) %>%
  summarise(listed_in_number = n())

# Categories ordered from most to fewest titles.
# The original ordered by the category name and passed `decreasing = TRUE`
# to `[` instead of to order(), so no descending sort by count took place.
netflixalllisted_in2 <- netflixalllisted_in1 %>%
  arrange(desc(listed_in_number))

# NOTE: this deliberately overwrites netflixalllisted_in1 with the subset of
# categories that have more than 200 titles; the plot below relies on it.
netflixalllisted_in1 <- filter(netflixalllisted_in2, listed_in_number > 200)
The most common categories in `listed_in` are shown below:
library(dplyr)
library(ggplot2)
# Scatter plot of the number of titles per category (categories with more
# than 200 titles only). Columns are referenced by bare name inside aes():
# using `data$column` there is discouraged by ggplot2 and raises a warning.
ggplot(netflixalllisted_in1) +
  geom_point(aes(x = listed_in, y = listed_in_number, color = listed_in))
## Warning: Use of `netflixalllisted_in1$listed_in` is discouraged. Use `listed_in`
## instead.
## Warning: Use of `netflixalllisted_in1$listed_in_number` is discouraged. Use
## `listed_in_number` instead.
International movies are at the top of the list, followed by dramas, comedies and international TV shows.
As you can see below, there are more TV shows on Netflix in 2019.
# Number of titles per type (Movie / TV Show) and release year, restricted
# to release years 1980-2019 and listed from the newest year to the oldest.
# `.groups = "drop"` returns an ungrouped result directly, which replaces the
# trailing ungroup() and silences the "grouped output" message.
moviesandtvshows <- netflixraw %>%
  filter(release_year >= 1980, release_year < 2020) %>%
  group_by(type, release_year) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(release_year))
## `summarise()` has grouped output by 'type'. You can override using the `.groups` argument.
moviesandtvshows
# Line chart of titles per release year, one line per type. Columns are
# referenced by bare name inside aes(): using `data$column` there is
# discouraged by ggplot2 and raises a warning.
ggplot(moviesandtvshows) +
  geom_line(aes(x = release_year, y = count, color = type))
## Warning: Use of `moviesandtvshows$release_year` is discouraged. Use
## `release_year` instead.
## Warning: Use of `moviesandtvshows$count` is discouraged. Use `count` instead.
Here are the top 20 artists who appear most often in Netflix titles.
# One row per (title, cast member): `cast` holds a ", "-separated list of
# names, so split it into a list column and unnest it.
netflixartists <- netflix %>%
  mutate(cast = strsplit(as.character(cast), ", ")) %>%
  unnest(cast)

# Number of titles per cast member. Titles with no cast information yield an
# NA entry after the split; those are excluded so that "NA" is not counted
# as an artist in the ranking.
netflixartists1 <- netflixartists %>%
  filter(!is.na(cast)) %>%
  group_by(cast) %>%
  summarise(numberofplay = n())

# Top 20 cast members by number of titles (ties at the cut-off are kept),
# listed from most to fewest. slice_max() replaces the superseded top_n().
netflixartists1 %>%
  slice_max(numberofplay, n = 20)