READING DATA

# Download the Netflix titles CSV from GitHub and parse it into `netflixraw`.
netflixraw <- read_csv(
  file = "https://github.com/ygterl/EDA-Netflix-2020-in-R/raw/master/netflix_titles.csv"
)
## Rows: 6234 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): type, title, director, cast, country, date_added, rating, duration...
## dbl  (2): show_id, release_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Understanding DATA

Then we can review the data with the following functions: `glimpse()` and `summary()`.

# dplyr provides glimpse() and the data verbs used in the rest of the analysis.
library(dplyr)

# One line per column: name, type and the first few values.
glimpse(x = netflixraw)
## Rows: 6,234
## Columns: 12
## $ show_id      <dbl> 81145628, 80117401, 70234439, 80058654, 80125979, 8016389…
## $ type         <chr> "Movie", "Movie", "TV Show", "TV Show", "Movie", "TV Show…
## $ title        <chr> "Norm of the North: King Sized Adventure", "Jandino: What…
## $ director     <chr> "Richard Finn, Tim Maltby", NA, NA, NA, "Fernando Lebrija…
## $ cast         <chr> "Alan Marriott, Andrew Toth, Brian Dobson, Cole Howard, J…
## $ country      <chr> "United States, India, South Korea, China", "United Kingd…
## $ date_added   <chr> "September 9, 2019", "September 9, 2016", "September 8, 2…
## $ release_year <dbl> 2019, 2016, 2013, 2016, 2017, 2016, 2014, 2017, 2017, 201…
## $ rating       <chr> "TV-PG", "TV-MA", "TV-Y7-FV", "TV-Y7", "TV-14", "TV-MA", …
## $ duration     <chr> "90 min", "94 min", "1 Season", "1 Season", "99 min", "1 …
## $ listed_in    <chr> "Children & Family Movies, Comedies", "Stand-Up Comedy", …
## $ description  <chr> "Before planning an awesome wedding for his grandfather, …
# Per-column summary: quantiles for numeric columns, length/class for text.
summary(object = netflixraw)
##     show_id             type              title             director        
##  Min.   :  247747   Length:6234        Length:6234        Length:6234       
##  1st Qu.:80035802   Class :character   Class :character   Class :character  
##  Median :80163367   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :76703679                                                           
##  3rd Qu.:80244889                                                           
##  Max.   :81235729                                                           
##      cast             country           date_added         release_year 
##  Length:6234        Length:6234        Length:6234        Min.   :1925  
##  Class :character   Class :character   Class :character   1st Qu.:2013  
##  Mode  :character   Mode  :character   Mode  :character   Median :2016  
##                                                           Mean   :2013  
##                                                           3rd Qu.:2018  
##                                                           Max.   :2020  
##     rating            duration          listed_in         description       
##  Length:6234        Length:6234        Length:6234        Length:6234       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
# Build the working table `netflix`:
#   1. keep only the columns used later (`rating` was previously listed twice;
#      the duplicate was redundant, so the resulting columns are unchanged),
#   2. drop repeated titles, treating rows with the same title, country, type
#      and release year as duplicates and keeping the first occurrence.
netflix <- netflixraw %>%
  select(
    show_id, type, title, director, cast, rating, country,
    release_year, duration, listed_in
  ) %>%
  distinct(title, country, type, release_year, .keep_all = TRUE)

# Same per-column summary, now on the trimmed and de-duplicated table.
summary(object = netflix)
##     show_id             type              title             director        
##  Min.   :  247747   Length:6232        Length:6232        Length:6232       
##  1st Qu.:80035688   Class :character   Class :character   Class :character  
##  Median :80163360   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :76702412                                                           
##  3rd Qu.:80244866                                                           
##  Max.   :81235729                                                           
##      cast              rating            country           release_year 
##  Length:6232        Length:6232        Length:6232        Min.   :1925  
##  Class :character   Class :character   Class :character   1st Qu.:2013  
##  Mode  :character   Mode  :character   Mode  :character   Median :2016  
##                                                           Mean   :2013  
##                                                           3rd Qu.:2018  
##                                                           Max.   :2020  
##    duration          listed_in        
##  Length:6232        Length:6232       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Here we split the comma-separated values and treat each value as a separate row.

# One row per (title, country): split the comma-separated `country` string
# into a list column, then expand it so each country gets its own row.
# NOTE(review): unnest() is a tidyr function; tidyr is presumably attached
# outside this section -- confirm.
netflixallcountry <- netflix %>%
  mutate(country = strsplit(as.character(country), ", ")) %>%
  unnest(country)

# Number of titles associated with each country.
netflixallcountry1 <- netflixallcountry %>%
  group_by(country) %>%
  summarise(country_number = n())

# Countries ordered from most to fewest titles.
# `decreasing = TRUE` must be an argument of order(); it was previously passed
# to `[`, where it never reached order(), so the rows came out ascending.
netflixallcountry2 <- netflixallcountry1[
  order(netflixallcountry1$country_number, decreasing = TRUE),
]

# One row per (title, category): split the comma-separated `listed_in` string
# into a list column, then expand it so each category gets its own row.
# NOTE(review): unnest() is a tidyr function; tidyr is presumably attached
# outside this section -- confirm.
netflixalllisted_in <- netflix %>%
  mutate(listed_in = strsplit(as.character(listed_in), ", ")) %>%
  unnest(listed_in)

# Number of titles in each category.
netflixalllisted_in1 <- netflixalllisted_in %>%
  group_by(listed_in) %>%
  summarise(listed_in_number = n())

# Categories in descending order of category name.
# `decreasing = TRUE` must be an argument of order(); it was previously passed
# to `[`, where it never reached order(), so the rows came out ascending.
# NOTE(review): the sort key is kept as written (`listed_in`); the parallel
# country table sorts by its count column, so `listed_in_number` may have been
# intended here -- confirm.
netflixalllisted_in2 <- netflixalllisted_in1[
  order(netflixalllisted_in1$listed_in, decreasing = TRUE),
]

# Keep only categories with more than 200 titles. This deliberately reuses the
# name `netflixalllisted_in1`, which the plot below reads.
netflixalllisted_in1 <- filter(netflixalllisted_in2, listed_in_number > 200)

The most frequent `listed_in` categories (those with more than 200 titles) are shown below:

library(dplyr)
library(ggplot2)

# Point plot of the number of titles per category, one colour per category.
# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the data passed to ggplot(). Writing `netflixalllisted_in1$...` there
# triggers "Use of `df$col` is discouraged" warnings and leaks the data-frame
# name into the axis labels.
ggplot(netflixalllisted_in1) +
  geom_point(aes(x = listed_in, y = listed_in_number, color = listed_in))
## Warning: Use of `netflixalllisted_in1$listed_in` is discouraged. Use `listed_in`
## instead.
## Warning: Use of `netflixalllisted_in1$listed_in_number` is discouraged. Use
## `listed_in_number` instead.

While International Movies is at the top of the list, it is followed by Dramas, Comedies and International TV Shows.

As you can see below, there are more TV shows on Netflix in 2019.

# Titles per type and release year, restricted to release years 1980-2019,
# listed with the most recent years first.
moviesandtvshows <- netflixraw %>%
  filter(release_year >= 1980, release_year < 2020) %>%
  group_by(type, release_year) %>%
  summarise(count = n()) %>%
  # summarise() leaves the result grouped by `type`; drop that grouping.
  ungroup() %>%
  arrange(desc(release_year))
## `summarise()` has grouped output by 'type'. You can override using the `.groups` argument.
# Print the yearly counts, then draw one line per title type.
moviesandtvshows

# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the data passed to ggplot(). Writing `moviesandtvshows$...` there triggers
# "Use of `df$col` is discouraged" warnings and leaks the data-frame name into
# the axis labels.
ggplot(moviesandtvshows) +
  geom_line(aes(x = release_year, y = count, color = type))
## Warning: Use of `moviesandtvshows$release_year` is discouraged. Use
## `release_year` instead.
## Warning: Use of `moviesandtvshows$count` is discouraged. Use `count` instead.

Here are the top 20 artists who appear most often in Netflix titles.

# One row per (title, cast member): split the comma-separated `cast` string
# into a list column, then expand it so each performer gets its own row.
# NOTE(review): unnest() is a tidyr function; tidyr is presumably attached
# outside this section -- confirm.
netflixartists <- netflix %>%
  mutate(cast = strsplit(as.character(cast), ", ")) %>%
  unnest(cast)

# Number of titles each cast member appears in.
netflixartists1 <- netflixartists %>%
  group_by(cast) %>%
  summarise(numberofplay = n())

# Top 20 cast members by number of titles, most frequent first.
# - A title with a missing `cast` stays as a single NA row after the split, so
#   all such titles collapse into one NA "artist"; exclude it so it cannot
#   take a top-20 slot.
# - slice_max() replaces the superseded top_n(): ties at the cut-off are still
#   kept, and the rows are returned sorted by `numberofplay`.
netflixartists1 %>%
  filter(!is.na(cast)) %>%
  slice_max(numberofplay, n = 20)