Translate Dataset

library(tidyverse)

# remote CSV, served by the EDI repository data viewer (packageid edi.5.1)
url <- 'https://portal.edirepository.org/nis/dataviewer?packageid=edi.5.1&entityid=91615b931c54c3aefedd0b048b22344c'

# local copy of the download, so the portal only has to be queried once
dir_cache <- 'cache'
csv_cache <- file.path(dir_cache, 'int-fish_91615b931c54c3aefedd0b048b22344c.csv')

# translated outputs: the first 1000 rows (csv_small) and the full table (csv_big)
csv_small <- 'data/int-fish_1000rows_91615b931c54c3aefedd0b048b22344c.csv'
csv_big <- '~/Dropbox/mbon/sbc_data/int-fish_91615b931c54c3aefedd0b048b22344c.csv'

# fetch: download the source table once, then always work from the cached copy
if (!file.exists(csv_cache)) {
  # write_csv() does not create missing directories, so make sure the cache
  # folder exists before the first download is saved
  dir.create(dirname(csv_cache), showWarnings = FALSE, recursive = TRUE)
  d <- read_csv(url)
  write_csv(d, csv_cache)
}
# always read from the cache, so first and later runs parse the same file
d <- read_csv(csv_cache) # s = spec_csv(csv_cache)

# translate: derive the output columns from the source columns of `d` and keep
# only those derived columns, in the order listed in select().
# NOTE(review): the output names look like Darwin Core terms -- confirm the
# intended mapping (e.g. identificationID vs. an occurrence-level identifier).
d2 <- d %>%
  mutate(
    # composite key; %d requires replicate_id to hold whole numbers
    identificationID     = sprintf('%s_%s_%s_%s_%d', site_id, subsite_id, proj_taxon_id, transect_id, replicate_id),
    # subsite_name is missing for many rows; sprintf() would render a missing
    # value as the literal text "NA" (e.g. "<site>_NA"), so fall back to the
    # site name alone when there is no subsite
    Location             = ifelse(
      is.na(subsite_name),
      as.character(site_name),
      sprintf('%s_%s', site_name, subsite_name)),
    decimalLatitude      = latitude,
    decimalLongitude     = longitude,
    eventDate            = date,
    eventRemarks         = data_source,
    samplingProtocol     = sprintf('%s -- %s', sample_method, sample_subtype),
    # without a height the sample size is the area; with one it is area * height,
    # and the unit label switches accordingly
    sampleSizeValue      = ifelse(is.na(height),           area,  area * height),
    sampleSizeUnit       = ifelse(is.na(height), 'square meter', 'cubic meter'),
    ScientificName       = taxon_name,
    taxonID              = auth_taxon_id,
    nameAccordingToID    = auth_name,
    organismQuantity     = count,
    organismQuantityType = "individual") %>%
  select(
    identificationID,
    Location, decimalLatitude, decimalLongitude,
    eventDate, eventRemarks,
    samplingProtocol, sampleSizeValue, sampleSizeUnit,
    ScientificName, taxonID, nameAccordingToID,
    organismQuantity, organismQuantityType)

# output: the full translated table, plus a sample of its first 1000 rows
write_csv(d2, csv_big)
write_csv(head(d2, 1000), csv_small)

# preview: show the same first 1000 rows as a DT table
DT::datatable(head(d2, 1000))

Outputs

Issues

Notes:

  • site_name is read as an integer column rather than a text name
  • subsite_name is NA in 401,036 of 3,694,040 rows