# setup ----
# Bootstrap `librarian` if it is missing, then let `librarian::shelf()` attach
# the packages used below (shelf installs any that are not yet available).
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}
librarian::shelf(
  dplyr,
  ggplot2,
  here,
  networkD3,
  tm,
  wordcloud
)
# Word cloud of dataset titles and summaries ----
# NOTE(review): `data` is not created in this section; presumably it is loaded
# elsewhere in the file -- confirm.

# Paste each title with its summary, then collapse everything into one string.
combined_text <- paste(
  data$Dataset.title,
  data$Dataset.summary,
  collapse = " "
)

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Build the corpus and clean it: separators become spaces, text is lower-cased,
# then punctuation, digits, English stop words and extra whitespace are removed.
corpus <- VectorSource(combined_text) %>%
  Corpus() %>%
  tm_map(toSpace, "/") %>%
  tm_map(toSpace, "@") %>%
  tm_map(toSpace, "\\|") %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace)

# Term-document matrix -> per-term totals, most frequent first.
# (`matrix` and `df` shadow base/stats functions; the names are kept as-is
# because code outside this section may refer to them.)
dtm <- TermDocumentMatrix(corpus)
matrix <- as.matrix(dtm)
word_freqs <- matrix %>%
  rowSums() %>%
  sort(decreasing = TRUE)
df <- data.frame(word = names(word_freqs), freq = word_freqs)

# Draw the cloud: at most 100 words, plotted in decreasing frequency order.
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# Load the constants used below (presumably `yes_values` and `no_values`;
# verify against R/yes_no_values.R).
source("R/yes_no_values.R")

# Replace entries of `x` whose lower-cased value is in `from` with `to`;
# all other entries are passed through `ifelse()` unchanged.
relabel_matches <- function(x, from, to) {
  ifelse(tolower(x) %in% from, to, x)
}

# Blank out "no"-style answers and turn "yes"-style answers into a label.
# The two passes per column run in the same order as before: yes-then-no for
# `erddap`, no-then-yes for `obis` and `ncei`.
data <- data %>%
  mutate(
    RA = relabel_matches(RA, no_values, ""),
    erddap = erddap %>%
      relabel_matches(yes_values, "in ERDDAP") %>%
      relabel_matches(no_values, ""),
    obis = obis %>%
      relabel_matches(no_values, "") %>%
      relabel_matches(yes_values, "in OBIS"),
    ncei = ncei %>%
      relabel_matches(no_values, "") %>%
      relabel_matches(yes_values, "in NCEI")
  )
# Build the Sankey link table (columns: source, target, value).
# Step 1: one RA -> repository table per repository column, stacked together.
# Step 2: drop rows whose source or target is the empty string (rows where
#         either is NA are dropped by filter() as well).
# Step 3: append the project -> RA links.
# NOTE(review): the project -> RA links are added after the filter, so rows
# with an empty `RA` are kept here -- confirm that this is intended.
links <- c("erddap", "obis", "ncei") %>%
  lapply(function(repo) {
    data.frame(
      source = data$RA,
      target = data[[repo]],
      value = 1
    )
  }) %>%
  do.call(rbind, .) %>%
  filter(source != "", target != "") %>%
  rbind(
    data.frame(
      source = data$mbon.project,
      target = data$RA,
      value = 1
    )
  )
# Node table: every distinct label that appears as a link source or target,
# sources first, in order of first appearance.
nodes <- data.frame(
  name = unique(
    c(
      as.character(links[["source"]]),
      as.character(links[["target"]])
    )
  )
)

# Translate link endpoints into node positions. match() is 1-based, so 1 is
# subtracted to produce the zero-based IDs passed to sankeyNetwork().
links[["IDsource"]] <- match(links[["source"]], nodes[["name"]]) - 1
links[["IDtarget"]] <- match(links[["target"]], nodes[["name"]]) - 1
# Assemble the Sankey diagram from the link table (zero-based IDsource /
# IDtarget plus value) and the node table (label column `name`).
p <- networkD3::sankeyNetwork(
  Links = links,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name",
  sinksRight = FALSE
)

# Auto-print the widget so it is displayed.
p