Installing the packages that will be needed in later sections
install.packages("htmltab", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages
install.packages("lubridate", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages
install.packages("plotly", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages
install.packages("dplyr", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages
install.packages("tm", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages
install.packages("prettydoc", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/c89zxyfj37lf9gbhgtjj6t9r0000gn/T//RtmpCYG6kf/downloaded_packages
Activating packages
library("htmltab")
library("lubridate")
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("plotly")
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("prettydoc")
Reading the data file from GitHub into R
# URL of the raw Netflix titles CSV hosted on GitHub.
urlfile <- "https://raw.githubusercontent.com/ygterl/EDA-Netflix-2020-in-R/master/netflix_titles.csv"

# Read the CSV into a data frame. stringsAsFactors = FALSE is spelled out so
# text columns stay character on R < 4.0 as well; the later strsplit() call on
# `listed_in` requires a character vector.
data <- read.csv(urlfile, stringsAsFactors = FALSE)
Creating an explanation table of the variables with plotly
# Two-row character matrix describing the data set's variables:
# row 1 holds the variable names, row 2 the matching descriptions.
# The values are written as (name, description) pairs; matrix() fills
# column by column, so each pair becomes one column of the matrix.
explanation_table <- matrix(
  c(
    "show_id", "Unique ID for movies",
    "type", "Movie or Tv Show ",
    "title", "Title of the Movie or TV Show",
    "director", "Director of the Movie /TV Show",
    "cast", "List of Actors",
    "country", "Where Movie or Tv show was produced",
    "date_added", "Movie added date on Netflix",
    "release_year", "Actual release year of Movie / TV Show",
    "rating", "Rating of Movie or TV Show",
    "duration", "Total Duration - in minutes or number of seasons",
    "listed_in", "Genre",
    "description", "The summary"
  ),
  nrow = 2
)
# Build a two-column plotly table (variable name / description) from
# explanation_table, then print it so the widget is rendered.
description_table <- plot_ly(
  type = "table",
  columnorder = c(1, 2),
  columnwidth = c(12, 12),
  # Header row: bold column titles on a light yellow background.
  header = list(
    values = c("<b>VARIABLES</b><br>", "<b>DESCRIPTION</b>"),
    line = list(color = "#000000"),
    fill = list(color = "#fff2cc"),
    align = c("left", "center"),
    font = list(color = "black", size = 15),
    height = 40
  ),
  # Body cells: values come from explanation_table; the two fill colours
  # are given per column.
  cells = list(
    values = explanation_table,
    line = list(color = "#506784"),
    fill = list(color = c("#C9DFEC", "white")),
    align = c("left", "left"),
    font = list(color = "#000000", size = 15),
    height = 30
  )
)

description_table
Cleaning the data frame: deleting the unnecessary variable “show_id”, changing the date format, removing NAs and duplicate records from the data set, and converting variables to factors.
# Drop the first column (show_id); drop = FALSE keeps the result a data frame.
data <- data[, -1, drop = FALSE]

# Parse the month-day-year text in date_added with lubridate.
data[["date_added"]] <- mdy(data[["date_added"]])

# Keep one row per (director, title, country, type, release_year) combination
# while retaining all other columns.
data <- data %>%
  distinct(director, title, country, type, release_year, .keep_all = TRUE)

# Store type as a factor.
data[["type"]] <- as.factor(data[["type"]])
Total number of TV series and movies by year.
# Count the number of titles per release year, then print summary
# statistics of the resulting year/count table.
data_release <- summarise(
  group_by(data, release_year),
  count = n()
)

summary(data_release)
## release_year count
## Min. :1925 Min. : 1.00
## 1st Qu.:1967 1st Qu.: 3.00
## Median :1984 Median : 9.00
## Mean :1983 Mean : 86.56
## 3rd Qu.:2002 3rd Qu.: 35.00
## Max. :2020 Max. :1062.00
Visualization of the total number of TV series and movies by year with the ggplot2 library; the y-axis is shown on a log10 scale.
# Scatter plot of the number of titles per release year, with the counts on a
# log10 y-axis and the points coloured by count.
ggplot(data_release, aes(x = release_year, y = count)) +
  geom_point(aes(color = count)) +
  # `limits` takes exactly two values (min, max); the 25-year step is a
  # tick spacing and therefore belongs in `breaks`.
  scale_x_continuous(
    limits = c(1925, 2020),
    breaks = seq(1925, 2020, by = 25)
  ) +
  # Only one y scale is defined. The former ylim(0, max) call was replaced by
  # this scale anyway (ggplot2 reported "Scale for 'y' is already present"),
  # and a lower limit of 0 is not representable on a log axis.
  scale_y_log10() +
  ggtitle("Number of Movie and TV Show in time scale") +
  xlab("Year") +
  ylab("Log10 Number of Movie and TV Show")
Which TV show and movie genres are the most common on Netflix?
# Split each title's comma-separated genre string into its individual genres.
# as.character() guards against `listed_in` being a factor, which strsplit()
# rejects.
data_genre <- strsplit(as.character(data$listed_in), split = ", ")

# One row per (title, genre): repeat each title's type once for every genre it
# lists. lengths() is the type-stable replacement for sapply(x, length), and
# stringsAsFactors = FALSE keeps `genre` as character on every R version.
data_genre <- data.frame(
  type = rep(data$type, lengths(data_genre)),
  genre = unlist(data_genre),
  stringsAsFactors = FALSE
)

# Number of titles per genre and type. `.groups = "drop"` returns an ungrouped
# result and avoids the "summarise() has grouped output" message.
sum_of_genre <- na.omit(data_genre) %>%
  group_by(genre, type) %>%
  summarise(count = n(), .groups = "drop")
# Bar chart of the number of titles per genre.
# y refers to the `count` column of the data passed to plot_ly() instead of
# reaching back to the global object with `$` inside the formula.
plot_ly(sum_of_genre, x = ~genre, y = ~count, type = "bar") %>%
  layout(
    title = "Genres on Netflix",
    # The layout font must be a list of font properties; a bare number is not
    # a valid font specification. Lower the size if axis labels get crowded.
    font = list(size = 24),
    plot_bgcolor = "#e5ecf6",
    xaxis = list(title = "Genres"),
    yaxis = list(title = "Sum of Genres")
  )