-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraping-informations.R
More file actions
114 lines (85 loc) · 3.14 KB
/
Copy pathscraping-informations.R
File metadata and controls
114 lines (85 loc) · 3.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
library(rvest)
install.packages("stringr")
library(stringr)
install.packages("lubridate")
library(lubridate)
install.packages("tidyverse")
library(tidyverse)
# Build a URL below the CRAN source "contrib" directory.
#
# Every argument is appended as a further path component, joined with "/".
# Called with no arguments, the contrib root URL itself is returned.
cran_link <- function(...) {
  contrib_root <- "https://cran.rstudio.com/src/contrib"
  file.path(contrib_root, ...)
}
# Scrape the CRAN contrib index page and take the first HTML table on it.
pkgs_raw <- read_html(cran_link()) %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table()

# Drop the first column of the index table
# (presumably the icon column of the directory listing -- TODO confirm).
pkgs_raw <- pkgs_raw[, -1]

# Keep only source tarballs and reduce each file name to the package name.
pkgs <- pkgs_raw %>%
  filter(
    # Rows whose Size is "-" are discarded
    # (presumably directories -- TODO confirm).
    Size != "-",
    # Dots are escaped so that only a literal ".tar.gz" suffix matches;
    # an unescaped "." would match any character.
    str_detect(Name, "\\.tar\\.gz$")
  ) %>%
  mutate(
    Date = dmy_hm(`Last modified`),
    # Keep the text before the first underscore of the file name.
    Name = str_extract(Name, "^[^_]+(?=_)")
  ) %>%
  select(-Size, -Description) %>%
  as_tibble()

pkgs
#------------------------------------------------------------------
# Scrape the CRAN "Archive" index page: take the first HTML table and
# discard its first column.
archives_raw <- read_html(cran_link("Archive")) %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table() %>%
  .[, -1]

# Keep only entries whose name ends in "/" and strip that trailing slash,
# leaving one row per archived entry with its parsed modification date.
archives_processed <- archives_raw %>%
  filter(str_detect(Name, "/$")) %>%
  mutate(
    Name = str_remove(Name, "/$"),
    Date = dmy_hm(`Last modified`)
  ) %>%
  select(-Size, -Description) %>%
  as_tibble()
#--------------------------------------------
# Download the archive listing page for one entry and return the text of
# every <td> cell on it as a character vector.
#
# `name` is the path component appended after "Archive" in the URL; it is
# also echoed with message() so progress is visible while scraping.
read_page <- function(name) {
  message(name)
  archive_page <- read_html(cran_link("Archive", name))
  table_cells <- html_nodes(archive_page, "td")
  html_text(table_cells)
}
# Fetch the archive listing of every entry (one HTTP request per row) and
# store the scraped cell texts in a list-column named `page`.
archives_scraped <- mutate(
  archives_processed,
  page = map(Name, read_page)
)

# Derive per-entry summary columns from the scraped cells, then drop them.
archives <- archives_scraped %>%
  mutate(
    # The 8th cell is parsed as a date-time.
    # NOTE(review): assumes 5 cells per listing row with the first row being
    # the parent-directory link, which would make cell 8 the date of the
    # first listed file -- confirm against the page layout.
    Date = dmy_hm(map_chr(page, function(cells) cells[8])),
    # Number of cells divided by 5, minus one (same layout assumption:
    # one row is not an archived version).
    ArchivedVersions = map_dbl(page, function(cells) length(cells) / 5 - 1)
  ) %>%
  select(-page)
# Combine current and archived listings into one table, one row per name.
# The three pieces are mutually exclusive by construction:
#   1. names only in `archives`          -> Archived = TRUE
#   2. names only in `pkgs`              -> Archived = FALSE, 0 archived versions
#   3. names in both (row from archives) -> Archived = FALSE
all_pkgs <- list(
  archives %>%
    anti_join(pkgs, by = "Name") %>%
    mutate(Archived = TRUE),
  pkgs %>%
    anti_join(archives, by = "Name") %>%
    mutate(
      ArchivedVersions = 0,
      Archived = FALSE
    ),
  archives %>%
    semi_join(pkgs, by = "Name") %>%
    mutate(Archived = FALSE)
) %>%
  bind_rows() %>%
  # Entries that are not archived count one extra version on top of the
  # archived ones.
  mutate(Versions = ArchivedVersions + ifelse(Archived, 0, 1)) %>%
  arrange(Name)

all_pkgs
# Cumulative number of non-archived packages over time.
# Rows are bucketed by the month of `Date`, counted per month, and the
# monthly counts are accumulated with cumsum().
all_pkgs %>%
  filter(!Archived) %>%
  group_by(Date = floor_date(Date, unit = "month")) %>%
  summarise(NewPackages = n()) %>%
  ungroup() %>%
  mutate(TotalPackages = cumsum(NewPackages)) %>%
  ggplot(aes(Date, TotalPackages)) +
  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0), where
  # `size` on lines is deprecated and raises a warning.
  geom_line(linewidth = 1.5, alpha = 0.8, color = "midnightblue") +
  labs(
    x = NULL, y = "Number of available packages",
    title = "How many packages are available on CRAN?",
    subtitle = "Only packages that are still available"
  )
# Bar chart of how many entries are archived versus still available.
all_pkgs %>%
  ggplot(aes(Archived)) +
  # geom_bar() counts rows per discrete value; it replaces
  # geom_histogram(stat = "count"), which draws the same bars but warns
  # about ignored binning parameters.
  geom_bar(alpha = 0.8, fill = "midnightblue") +
  # Labels are positional: first for FALSE, second for TRUE.
  scale_x_discrete(
    labels = c("Still available", "Archived, no longer available")
  ) +
  labs(
    y = "Number of packages", x = NULL,
    title = "How many packages are no longer available on CRAN?",
    # NOTE(review): the percentage below is hard-coded text, not computed
    # from `all_pkgs` -- verify it still holds for freshly scraped data.
    subtitle = "About 10% of total packages are no longer available"
  )
# Histogram of the `Versions` column of `all_pkgs`, in bins 10 wide.
# NOTE(review): the subtitle percentage is fixed text, not derived from the
# data plotted here.
ggplot(all_pkgs, aes(Versions)) +
  geom_histogram(
    binwidth = 10,
    alpha = 0.8,
    fill = "midnightblue"
  ) +
  labs(
    title = "How many versions do CRAN packages have?",
    subtitle = "About 25% of packages are on their first version",
    x = "Number of versions on CRAN",
    y = "Number of packages"
  )