MBON Data Report(s)

Data Reporting on MBON-mediated Datasets
setup
if (!requireNamespace("librarian", quietly = TRUE)) {
  # If not installed, install the package
  install.packages("librarian")
}

librarian::shelf(
  dplyr,
  ggplot2,
  here,
  networkD3,
  tm,
  wordcloud
)
Read in data from CSV
# Run the project's data-cleaning helper script (located relative to the
# project root via here), then call the getCleanedData() function to build
# the `data` table that the rest of the report works from.
# NOTE(review): getCleanedData() is presumably defined by the sourced file --
# confirm.
source(here::here("R/getCleanedData.R"))
data <- getCleanedData()
Create word cloud from titles & descriptions
# Pool every dataset title and summary into a single text document.
# Missing values are blanked first: paste() would otherwise render NA as the
# literal text "NA", which survives the cleaning steps below as the token "na"
# and gets counted as if it were a real word.
title_text <- as.character(data$`Dataset.title`)
summary_text <- as.character(data$`Dataset.summary`)
title_text[is.na(title_text)] <- ""
summary_text[is.na(summary_text)] <- ""
combined_text <- paste(title_text, summary_text, collapse = " ")
corpus <- Corpus(VectorSource(combined_text))

# Clean the text data
# toSpace replaces every match of `pattern` with a space, so that terms joined
# by "/", "@" or "|" are split into separate words instead of being fused
# together when punctuation is removed.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- corpus %>%
  tm_map(toSpace, "/") %>%
  tm_map(toSpace, "@") %>%
  tm_map(toSpace, "\\|") %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace)

# Create a Term-Document Matrix and turn it into a word/frequency table,
# most frequent words first.
dtm <- TermDocumentMatrix(corpus)
matrix <- as.matrix(dtm)
word_freqs <- sort(rowSums(matrix), decreasing = TRUE)
df <- data.frame(word = names(word_freqs), freq = word_freqs)

# Generate the word cloud: at most 100 words, every word that occurs at least
# once is eligible, and words are plotted in decreasing order of frequency.
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

Create a Sankey diagram
# Load the `yes_values` / `no_values` constants used below. The path is
# resolved from the project root with here(), the same way getCleanedData.R is
# sourced earlier in this report, so it does not depend on the working
# directory the report happens to be rendered from.
source(here("R/yes_no_values.R"))

# Normalise the columns that feed the Sankey diagram into node labels:
#   * a value listed in `no_values` becomes "" (empty labels are filtered out
#     when the RA -> repository links are built);
#   * a value listed in `yes_values` becomes a fixed "in <repository>" label;
#   * anything else is kept as-is.
# Values are compared after tolower(), so `yes_values` / `no_values` are
# presumably all lowercase -- TODO confirm against R/yes_no_values.R.
data <- data %>%
  mutate(
    RA     = ifelse(tolower(RA) %in% no_values, "", RA),
    erddap = ifelse(tolower(erddap) %in% yes_values, "in ERDDAP", erddap),
    erddap = ifelse(tolower(erddap) %in% no_values, "", erddap),
    obis   = ifelse(tolower(obis) %in% no_values, "", obis),
    obis   = ifelse(tolower(obis) %in% yes_values, "in OBIS", obis),
    ncei   = ifelse(tolower(ncei) %in% no_values, "", ncei),
    ncei   = ifelse(tolower(ncei) %in% yes_values, "in NCEI", ncei)
  )

# Build the Sankey link table (columns: source, target, value).
#
# Step 1: for each repository column, emit one link per dataset row running
#         from that row's RA to its repository label, each with a flow of 1.
# Step 2: discard links whose source or target is empty.
# Step 3: append one mbon.project -> RA link per dataset row.
# NOTE(review): step 3 happens after the filter, so a row with a blank RA
# still contributes a link to an empty-named node -- confirm this is intended.
links <- do.call(
  rbind,
  lapply(
    c("erddap", "obis", "ncei"),
    function(repository) {
      data.frame(source = data$RA, target = data[[repository]], value = 1)
    }
  )
) %>%
  filter(source != "", target != "") %>%
  rbind(
    data.frame(source = data$mbon.project, target = data$RA, value = 1)
  )
# Node table: every distinct label that occurs as a link source or target.
nodes <- data.frame(
  name = unique(c(as.character(links$source), as.character(links$target)))
)

# Attach numeric node ids to each link. match() returns 1-based positions in
# `nodes`; subtracting 1 turns them into the zero-based ids passed to
# sankeyNetwork() below.
links <- links %>%
  mutate(
    IDsource = match(source, nodes$name) - 1,
    IDtarget = match(target, nodes$name) - 1
  )

# Build the Sankey widget and display it.
p <- networkD3::sankeyNetwork(
  Links = links,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name",
  sinksRight = FALSE
)
p