# install.packages("tidyverse")
# install.packages("rvest")
# install.packages("stringr")
# install.packages("ggplot2")
imdb_assignment
Packages
Firstly, we have to install our packages.
Secondly, we need to load these packages.
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)
Attaching package: 'rvest'
The following object is masked from 'package:readr':
guess_encoding
library(stringr)
library(ggplot2)
Data
Links from which we will pull data
Assigning links from which we will pull data
# IMDb search-result URLs. Both ask for feature films (title_type=feature)
# with country_of_origin=TR and num_votes=2499 or more, up to 250 titles per
# request (count=250). The two URLs differ only in the release-date window.
# Released 2010-01-01 to 2023-12-25:
link1 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2023-12-25&num_votes=2499,&country_of_origin=TR&count=250"
# Released on or before 2009-12-31:
link2 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=,2009-12-31&num_votes=2499,&country_of_origin=TR&count=250"

# Download and parse each results page once; the selectors further down all
# query these two parsed documents.
page1 <- read_html(link1)
page2 <- read_html(link2)
Web Wrangling
We must obtain a “data frame” by parsing the characteristic features of the films. Thus, we can perform our exploratory data analysis with the data set we created.
(Since we have two separate links, after performing separate operations for both of them, we put them into a single vector with the c() function.)
For Titles :
# Raw text of every ".dli-title" node on each results page.
titles1 <- page1 %>%
  html_nodes(".dli-title") %>%
  html_text()
titles2 <- page2 %>%
  html_nodes(".dli-title") %>%
  html_text()

# One vector for both pages, page1 entries first.
combined_titles <- c(titles1, titles2)
# print(combined_titles)
For Years :
# Text of the first metadata item of each entry, used as the release year.
years1 <- page1 %>%
  html_nodes(".dli-title-metadata-item:nth-child(1)") %>%
  html_text()
years2 <- page2 %>%
  html_nodes(".dli-title-metadata-item:nth-child(1)") %>%
  html_text()

# One vector for both pages, page1 entries first. html_text() returns
# character values, so the years are strings at this point.
combined_years <- c(years1, years2)
# print(combined_years)
For Durations :
# Text of the second metadata item of each entry, used as the running time.
# NOTE(review): the selector is purely positional; this assumes every film
# lists a runtime as its second metadata item -- TODO confirm, otherwise the
# vector will not line up with the titles.
durations1 <- page1 %>%
  html_nodes(".dli-title-metadata-item:nth-child(2)") %>%
  html_text()
durations2 <- page2 %>%
  html_nodes(".dli-title-metadata-item:nth-child(2)") %>%
  html_text()

# One vector for both pages, page1 entries first (still raw text).
combined_durations <- c(durations1, durations2)
# print(combined_durations)
For Ratings :
# Raw text of every ".ratingGroup--imdb-rating" node on each results page.
ratings1 <- page1 %>%
  html_nodes(".ratingGroup--imdb-rating") %>%
  html_text()
ratings2 <- page2 %>%
  html_nodes(".ratingGroup--imdb-rating") %>%
  html_text()

# One vector for both pages, page1 entries first (still raw text).
combined_ratings <- c(ratings1, ratings2)
# print(combined_ratings)
For Votes :
# Raw text of every ".ipc-rating-star--voteCount" node on each results page.
votes1 <- page1 %>%
  html_nodes(".ipc-rating-star--voteCount") %>%
  html_text()
votes2 <- page2 %>%
  html_nodes(".ipc-rating-star--voteCount") %>%
  html_text()

# One vector for both pages, page1 entries first (still raw text).
combined_votes <- c(votes1, votes2)
# print(combined_votes)
String Processing
We pulled our data from the web. Now we need to make our data suitable for the data frame with string processing.
For Titles :
# Extract film titles from one parsed results page.
#
# Reads the text of every ".ipc-title__text" node, drops the first and the
# last element (presumably page-level headings rather than films -- verify
# against the live page), then splits each remaining entry at its first space
# and keeps the second piece (presumably removing a leading rank such as "1.").
# Entries without a space yield NA.
#
# page: a document returned by read_html().
# Returns a character vector with one title per remaining entry.
extract_title_names <- function(page) {
  node_text <- page %>%
    html_nodes(".ipc-title__text") %>%
    html_text()
  film_entries <- tail(head(node_text, -1), -1)
  pieces <- str_split(film_entries, " ", n = 2)
  # vapply() guarantees a character vector even when no entries are found.
  vapply(pieces, function(piece) piece[2], character(1))
}

title_names1 <- extract_title_names(page1)
title_names2 <- extract_title_names(page2)

# One vector for both pages, page1 entries first.
combined_title_names <- c(title_names1, title_names2)
# print(combined_title_names)
For Years :
Since the year data is available, we do not need to do any processing on them.
For Durations :
# For Durations - 1
# Pull the hour and minute components out of each runtime string: the digits
# immediately followed by "h" and the digits immediately followed by "m".
hours1 <- as.integer(str_extract(durations1, "\\d+(?=h)")) # For Hours
minutes1 <- as.integer(str_extract(durations1, "\\d+(?=m)")) # For Minutes

# A missing component counts as zero.
hours1[is.na(hours1)] <- 0L # runtime given in minutes only
minutes1[is.na(minutes1)] <- 0L # runtime given in hours only

# Convert the durations to minutes. Strings that contain neither component
# end up as 0.
total_duration_minutes1 <- (hours1 * 60) + minutes1

# For Durations - 2
hours2 <- as.integer(str_extract(durations2, "\\d+(?=h)"))
minutes2 <- as.integer(str_extract(durations2, "\\d+(?=m)"))

hours2[is.na(hours2)] <- 0L
minutes2[is.na(minutes2)] <- 0L

total_duration_minutes2 <- (hours2 * 60) + minutes2

# Replaces the raw-text combined_durations with the numeric minutes,
# page1 entries first.
combined_durations <- c(total_duration_minutes1, total_duration_minutes2)
# print(combined_durations)
For Votes :
# Raw vote-count labels for each film.
votes1 <- page1 %>%
  html_nodes(".ipc-rating-star--voteCount") %>%
  html_text()
votes2 <- page2 %>%
  html_nodes(".ipc-rating-star--voteCount") %>%
  html_text()

# Keep only digits, the decimal point and a magnitude suffix. This also strips
# the surrounding parentheses and any whitespace, so no separate "[()]"
# removal is needed.
votes1 <- str_remove_all(votes1, "[^0-9.KM]")
votes2 <- str_remove_all(votes2, "[^0-9.KM]")

# Scale factor implied by the suffix of each cleaned label. Multiplying every
# value by 1000 regardless of suffix would overstate a plain count such as
# "999" and understate a label in millions.
# NOTE(review): "M" as a millions suffix is assumed -- confirm on the site.
vote_multiplier <- function(labels) {
  case_when(
    str_detect(labels, "M") ~ 1e6,
    str_detect(labels, "K") ~ 1e3,
    TRUE ~ 1
  )
}

votes_numeric1 <- as.numeric(str_remove_all(votes1, "[KM]")) *
  vote_multiplier(votes1)
votes_numeric2 <- as.numeric(str_remove_all(votes2, "[KM]")) *
  vote_multiplier(votes2)

# Replaces the raw-text combined_votes with the numeric counts,
# page1 entries first.
combined_votes <- c(votes_numeric1, votes_numeric2)
# print(combined_votes)
For Ratings :
# Remove the parenthesised part of each rating string entirely.
rating_no_parant1 <- str_replace_all(combined_ratings, "\\(.*\\)", "")

# Keep the leading number. Whitespace is trimmed before the "^"-anchored
# match so that a leading space or newline cannot turn the rating into NA.
rating_no_parant_no_space <- str_extract(
  str_trim(rating_no_parant1),
  "^\\d+\\.?\\d*"
)

# Now convert the numbers to double; this replaces the raw-text
# combined_ratings with numeric values.
combined_ratings <- as.double(rating_no_parant_no_space)
class(combined_ratings)
[1] "numeric"
# print(combined_ratings)
Creating a Data Frame :
We have completed String Processing. All the data we have obtained now is of a simple and appropriate type suitable for adding to the data frame.
# Every column must hold exactly one value per film. data.frame() silently
# recycles a shorter vector whose length divides the longest one, so check
# the lengths explicitly before assembling the table.
stopifnot(
  length(unique(lengths(list(
    combined_title_names,
    combined_years,
    combined_durations,
    combined_ratings,
    combined_votes
  )))) == 1L
)

# One row per film: title, release year, runtime in minutes, IMDb rating and
# vote count.
imdb_movies_data <- data.frame(
  Title = combined_title_names,
  Year = combined_years,
  Duration = combined_durations,
  Rate = combined_ratings,
  Vote = combined_votes
)
Upgrade Data Frame
Let’s sort our data set in decreasing order of vote rate.
# Films ordered from the highest to the lowest rating.
imdb_movies_data_rate_decreasing <- imdb_movies_data %>%
  arrange(desc(Rate))
Let’s add a column to our data set showing the average rate by year.
# Attach each year's average rating to every film released in that year.
# ungroup() drops the grouping afterwards so that later verbs on this table
# operate on all rows rather than per year.
updated_imdb_movies_data <- imdb_movies_data_rate_decreasing %>%
  group_by(Year) %>%
  mutate(Mean_Rate = mean(Rate)) %>%
  ungroup()
Let’s add a column to our data set showing each film’s rank according to vote rate:
# Rank of each film in the rating-sorted table (1 = highest rated).
# Note: this starts again from imdb_movies_data_rate_decreasing, so it
# replaces the previous updated_imdb_movies_data and the result has no
# Mean_Rate column.
updated_imdb_movies_data <- imdb_movies_data_rate_decreasing %>%
  mutate(Order = row_number())
Let’s examine the differences :
head(imdb_movies_data)
Title Year Duration Rate Vote
1 Kuru Otlar Üstüne 2023 197 8.0 7700
2 Yedinci Kogustaki Mucize 2019 132 8.2 55000
3 Istanbul Için Son Çagri 2023 91 5.3 8400
4 Kis Uykusu 2014 196 8.0 55000
5 Baskin 2015 97 5.8 12000
6 Bihter 2023 113 3.6 3600
head(updated_imdb_movies_data)
Title Year Duration Rate Vote Order
1 Hababam Sinifi 1975 87 9.2 43000 1
2 CM101MMXI Fundamentals 2013 139 9.1 47000 2
3 Tosun Pasa 1976 90 8.9 24000 3
4 Hababam Sinifi Sinifta Kaldi 1975 95 8.9 25000 4
5 Süt Kardesler 1976 80 8.8 21000 5
6 Eflâtun 2022 103 8.7 3000 6
Adding Mean Rate :
# Per-year summary columns repeated on every film row: the year's average
# rating and the number of films released that year. The table is rebuilt
# from imdb_movies_data, so it carries no Order column. ungroup() drops the
# grouping so later verbs see the whole table.
updated_imdb_movies_data <- imdb_movies_data %>%
  group_by(Year) %>%
  mutate(Mean_Rate = mean(Rate), Total_Film_Numbers = n()) %>%
  ungroup()
Extras
Let’s find the 5 movies with the highest and lowest scores based on IMDb scores.
# Films ordered from the highest to the lowest rating, so head() gives the
# best-rated films and tail() the worst-rated ones.
imdb_movies_data_rate_decreasing <- imdb_movies_data %>%
  arrange(desc(Rate))
# The 5 best and the 5 worst films:
print("En iyi oy oranına sahip 5 film:")
[1] "En iyi oy oranına sahip 5 film:"
print(head(imdb_movies_data_rate_decreasing, 5))
Title Year Duration Rate Vote
1 Hababam Sinifi 1975 87 9.2 43000
2 CM101MMXI Fundamentals 2013 139 9.1 47000
3 Tosun Pasa 1976 90 8.9 24000
4 Hababam Sinifi Sinifta Kaldi 1975 95 8.9 25000
5 Süt Kardesler 1976 80 8.8 21000
# Print the bottom 5 rows
print("En kotu oy oranına sahip 5 film:")
[1] "En kotu oy oranına sahip 5 film:"
print(tail(imdb_movies_data_rate_decreasing, 5))
Title Year Duration Rate Vote
470 Cumali Ceber 2 2018 100 1.2 10000
471 Müjde 2022 48 1.2 9900
472 15/07 Safak Vakti 2021 95 1.2 21000
473 Reis 2017 108 1.0 74000
474 Cumali Ceber: Allah Seni Alsin 2017 100 1.0 39000
Let’s examine the imdb data of 4 movies I like.
# Look up films by exact title in imdb_movies_data_rate_decreasing.
#
# name_of_the_film: title to match exactly (case-sensitive); a vector of
#   several titles is also accepted.
# Returns the matching rows with all columns, or zero rows when nothing
# matches.
search_film <- function(name_of_the_film) {
  # `%in%` never yields NA, so rows whose Title is missing are skipped
  # instead of coming back as all-NA rows the way a `==` comparison would.
  is_match <- imdb_movies_data_rate_decreasing$Title %in% name_of_the_film
  imdb_movies_data_rate_decreasing[is_match, ]
}
search_film("Senden Bana Kalan")
Title Year Duration Rate Vote
264 Senden Bana Kalan 2015 117 6.5 3400
search_film("Aykut Eniste")
Title Year Duration Rate Vote
194 Aykut Eniste 2019 115 7 7500
search_film("Kolonya Cumhuriyeti")
Title Year Duration Rate Vote
353 Kolonya Cumhuriyeti 2017 110 5.8 5500
search_film("Recep Ivedik 2")
Title Year Duration Rate Vote
421 Recep Ivedik 2 2009 107 4.5 23000
PLOTS
MEAN RATE
# Column chart of the average rating per year.
# updated_imdb_movies_data repeats Mean_Rate on every film row and geom_col()
# stacks all rows that share an x value, so plotting the table directly would
# draw bars of height mean x number-of-films. Reduce to one row per year first.
plot_mean_rate_per_year_col <- updated_imdb_movies_data %>%
  ungroup() %>%
  distinct(Year, Mean_Rate) %>%
  ggplot(aes(x = Year, y = Mean_Rate)) +
  geom_col()
print(plot_mean_rate_per_year_col)
# Scatter plot of the average rating per year. Films of the same year share
# one Mean_Rate value, so their points coincide.
plot_mean_rate_per_year_point <- updated_imdb_movies_data %>%
  ggplot(aes(x = Year, y = Mean_Rate)) +
  geom_point()
print(plot_mean_rate_per_year_point)
RATE
# Box plot of the individual film ratings for each year.
plot_rate_per_year_box <- updated_imdb_movies_data %>%
  ggplot(aes(x = Year, y = Rate)) +
  geom_boxplot()
print(plot_rate_per_year_box)
FILM NUMBER
# Scatter plot of the number of films per year. Films of the same year share
# one Total_Film_Numbers value, so their points coincide.
plot_total_film_number_per_year_point <- updated_imdb_movies_data %>%
  ggplot(aes(x = Year, y = Total_Film_Numbers)) +
  geom_point()
print(plot_total_film_number_per_year_point)