library(tidyverse)
# Remote source: SBC LTER integrated fish dataset from the EDI repository
url <- 'https://portal.edirepository.org/nis/dataviewer?packageid=edi.5.1&entityid=91615b931c54c3aefedd0b048b22344c'
# Local cache of the raw download (avoids re-fetching the large file)
dir_cache <- 'cache'
csv_cache <- file.path(dir_cache, 'int-fish_91615b931c54c3aefedd0b048b22344c.csv')
# Outputs: 1,000-row preview kept in the repo; full translated file kept in Dropbox
csv_small <- 'data/int-fish_1000rows_91615b931c54c3aefedd0b048b22344c.csv'
csv_big   <- '~/Dropbox/mbon/sbc_data/int-fish_91615b931c54c3aefedd0b048b22344c.csv'
# fetch: download once and cache locally; subsequent runs read from the cache
if (!file.exists(csv_cache)) {
  # write_csv() does not create missing directories -- ensure the cache dir exists
  if (!dir.exists(dir_cache)) {
    dir.create(dir_cache, recursive = TRUE)
  }
  d <- read_csv(url)
  write_csv(d, csv_cache)
}
# re-read from cache so every run starts from the same on-disk snapshot
d <- read_csv(csv_cache) # s = spec_csv(csv_cache)
# translate: map source columns onto Darwin-Core-style terms, then keep only
# the translated columns (original columns are dropped by the select)
d2 <- d %>%
  mutate(
    # composite key identifying a single observation event
    # NOTE(review): '%d' assumes replicate_id is integer; if readr guessed
    # double for this column, sprintf() errors -- confirm against spec_csv()
    identificationID = sprintf('%s_%s_%s_%s_%d', site_id, subsite_id, proj_taxon_id, transect_id, replicate_id),
    Location = sprintf('%s_%s', site_name, subsite_name),
    decimalLatitude = latitude,
    decimalLongitude = longitude,
    eventDate = date,
    eventRemarks = data_source,
    samplingProtocol = sprintf('%s -- %s', sample_method, sample_subtype),
    # no height recorded -> sampled an area (m^2); otherwise a volume (m^3)
    sampleSizeValue = ifelse(is.na(height), area, area * height),
    sampleSizeUnit = ifelse(is.na(height), 'square meter', 'cubic meter'),
    ScientificName = taxon_name,
    taxonID = auth_taxon_id,
    nameAccordingToID = auth_name,
    organismQuantity = count,
    organismQuantityType = "individual") %>%
  select(
    identificationID,
    Location, decimalLatitude, decimalLongitude,
    eventDate, eventRemarks,
    samplingProtocol, sampleSizeValue, sampleSizeUnit,
    ScientificName, taxonID, nameAccordingToID,
    organismQuantity, organismQuantityType)
# output: write the full translated table to Dropbox and a small
# 1,000-row preview into the repo; compute the preview slice once
write_csv(d2, csv_big)
d2_preview <- head(d2, 1000)
write_csv(d2_preview, csv_small)
# preview: render the same 1,000 rows as an interactive table
DT::datatable(d2_preview)
# head (first 1,000 rows; 156.611 KB): GitHub: int-fish_1000rows_91615b931c54c3aefedd0b048b22344c.csv
# full (all 3,694,040 rows; 561.650059 MB): Dropbox: int-fish_91615b931c54c3aefedd0b048b22344c.csv
# Notes:
# - site_name is integer
# - subsite_name has 401,036 NAs of 3,694,040 rows