Week 4 - Data Analysis on Netflix dataset

Installing the packages that will be needed in the following sections

# Download from CRAN over HTTPS so the package cannot be altered in transit.
install.packages("htmltab", repos = "https://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages

# Download from CRAN over HTTPS so the package cannot be altered in transit.
install.packages("lubridate", repos = "https://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages

# Download from CRAN over HTTPS so the package cannot be altered in transit.
install.packages("plotly", repos = "https://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages

# Download from CRAN over HTTPS so the package cannot be altered in transit.
install.packages("dplyr", repos = "https://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages

# Download from CRAN over HTTPS so the package cannot be altered in transit.
install.packages("tm", repos = "https://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages

# Download from CRAN over HTTPS so the package cannot be altered in transit.
install.packages("prettydoc", repos = "https://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages

Activating packages

library("htmltab")
library("lubridate")

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library("plotly")

## Loading required package: ggplot2

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("tm")

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library("prettydoc")

Reading the data file from GitHub into the local R session

# Location of the raw Netflix titles CSV on GitHub.
urlfile <- "https://raw.githubusercontent.com/ygterl/EDA-Netflix-2020-in-R/master/netflix_titles.csv"

# read.csv() accepts a URL, so the file is fetched and parsed in one step.
data <- read.csv(file = urlfile)

Create an explanation table of the variables with plotly

# Two-row character matrix describing the dataset: row 1 holds the variable
# names, row 2 the matching descriptions. The values are listed as
# (name, description) pairs; matrix() fills column by column, so with
# nrow = 2 every pair becomes one column.
explanation_table <- matrix(
  c(
    "show_id", "Unique ID for movies",
    "type", "Movie or Tv Show ",
    "title", "Title of the Movie or TV Show",
    "director", "Director of the Movie /TV Show",
    "cast", "List of Actors",
    "country", "Where Movie or Tv show was produced",
    "date_added", "Movie added date on Netflix",
    "release_year", "Actual release year of Movie / TV Show",
    "rating", "Rating of Movie or TV Show",
    "duration", "Total Duration - in minutes or number of seasons",
    "listed_in", "Genre",
    "description", "The summary"
  ),
  nrow = 2
)

# Render `explanation_table` as a two-column plotly table
# (variable name | description) and display it.
description_table <- plot_ly(
  type = "table",
  columnorder = c(1, 2),
  columnwidth = c(12, 12),
  # Header row: bold column titles on a light-yellow background.
  header = list(
    values = c("<b>VARIABLES</b><br>", "<b>DESCRIPTION</b>"),
    align = c("left", "center"),
    height = 40,
    font = list(color = "black", size = 15),
    fill = list(color = "#fff2cc"),
    line = list(color = "#000000")
  ),
  # Body cells: contents come from the explanation matrix.
  cells = list(
    values = explanation_table,
    align = c("left", "left"),
    height = 30,
    font = list(color = "#000000", size = 15),
    fill = list(color = c("#C9DFEC", "white")),
    line = list(color = "#506784")
  )
)
description_table

Cleaning the data frame: delete the unnecessary variable “show_id”, convert the date format of “date_added”, remove duplicate records from the data, and convert “type” to a factor. (Rows with NAs are removed later, in the genre analysis.)

# Drop the identifier column by name rather than by position, so running
# this chunk a second time cannot silently remove another column.
data$show_id <- NULL

# Parse the "month day, year" strings into Date values.
data$date_added <- mdy(data$date_added)

# Keep one row per (director, title, country, type, release_year)
# combination; .keep_all = TRUE retains the remaining columns of that row.
data <- distinct(
  data,
  director, title, country, type, release_year,
  .keep_all = TRUE
)

data$type <- as.factor(data$type)

Total number of TV series and movies by year.

# One row per release year, with the number of titles released that year.
data_release <- data %>%
  group_by(release_year) %>%
  summarise(count = n())

# Distribution of the years and of the per-year counts.
summary(data_release)

##   release_year      count        
##  Min.   :1925   Min.   :   1.00  
##  1st Qu.:1967   1st Qu.:   3.00  
##  Median :1984   Median :   9.00  
##  Mean   :1983   Mean   :  86.56  
##  3rd Qu.:2002   3rd Qu.:  35.00  
##  Max.   :2020   Max.   :1062.00

Visualization of the total number of TV series and movies by release year with the ggplot2 library; the y-axis uses a log10 scale.

# Scatter plot of the number of titles per release year, with the counts on
# a log10 y-axis.
ggplot(data_release, aes(x = release_year, y = count)) +
  geom_point(aes(color = count)) +
  # `limits` takes exactly two values (min, max); the 25-year step belongs
  # in `breaks`.
  scale_x_continuous(
    limits = c(1925, 2020),
    breaks = seq(1925, 2020, by = 25)
  ) +
  # Only one y scale is declared: a preceding ylim(0, ...) would be replaced
  # by scale_y_log10(), and a lower limit of 0 is undefined on a log axis.
  scale_y_log10() +
  ggtitle("Number of Movie and TV Show in time scale") +
  xlab("Year") +
  ylab("Log10 Number of Movie and TV Show")

Which TV show and movie genres are the most common on Netflix?

# Split each title's comma-separated genre string into its individual genres.
# as.character() guards against `listed_in` having been read as a factor,
# which strsplit() rejects.
data_genre <- strsplit(
  as.character(data$listed_in),
  split = ", ",
  fixed = TRUE
)

# One row per (title, genre) pair: each title's type is repeated once per
# genre. lengths() always returns an integer vector, unlike
# sapply(x, length), and stringsAsFactors = FALSE keeps `genre` as character.
data_genre <- data.frame(
  type = rep(data$type, lengths(data_genre)),
  genre = unlist(data_genre),
  stringsAsFactors = FALSE
)

# Number of titles per genre and type, ignoring rows with a missing value.
sum_of_genre <- na.omit(data_genre) %>%
  group_by(genre, type) %>%
  summarise(count = n())

## `summarise()` has grouped output by 'genre'. You can override using the `.groups` argument.

# Bar chart of the number of titles per genre.
# Both axes are given as formulas (~genre, ~count) so that plotly looks the
# columns up in `sum_of_genre` instead of reaching for the global object.
plot_ly(sum_of_genre, x = ~genre, y = ~count, type = "bar") %>%
  layout(
    title = "Genres on Netflix",
    # layout.font must be a list of font attributes; a bare number is not a
    # valid value.
    font = list(size = 24),
    plot_bgcolor = "#e5ecf6",
    xaxis = list(title = "Genres"),
    yaxis = list(title = "Sum of Genres")
  )

netflix_assignmnt

ozgur_akbelen

11/3/2021