explore_obis

Goal

Explore OBIS occurrence data to rank species by number of occurrences, spatial/temporal distribution, and taxonomic importance (e.g., trophic level, extinction risk). The ranking will be used to draw up a short list of species for fitting initial species distribution models (SDMs).

Source: Data access | Ocean Biodiversity Information System

Read OBIS parquet

OBIS

# libraries ----
librarian::shelf(
  arrow, dplyr, DT, 
  # BigelowLab/fullobis,
  glue, here, mapview, readr, sf)

  The 'cran_repo' argument in shelf() was not set, so it will use
  cran_repo = 'https://cran.r-project.org' by default.

  To avoid this message, set the 'cran_repo' argument to a CRAN
  mirror URL (see https://cran.r-project.org/mirrors.html) or set
  'quiet = TRUE'.
Warning: package 'arrow' was built under R version 4.3.1
# variables -----
# raw OBIS files: absolute path on the author's machine
dir_data       <- "/Users/bbest/My Drive/projects/mbon-sdm/data"
dir_obis       <- glue("{dir_data}/raw/obis.org")
obis_prq       <- glue("{dir_obis}/obis_20230726.parquet")      #  4.9 GB
obis_csv       <- glue("{dir_obis}/obis_20230726_subcols.csv")  # 16.6 GB
# derived summaries: small files located with here()
obis_sum_csv   <- here("data/obis_summary.csv")                 # 12.4 MB
obis_topsp_csv <- here("data/obis_top-species.csv")             # 22   KB

marinebon/sdm-explore: data/

# fetch OBIS parquet
# ok       <- fetch_obis()
# obis_prq <- get_current()

# Build the column-subset CSV once; the whole block is skipped when the CSV
# already exists.
if (!file.exists(obis_csv)) {

  # fullobis (currently disabled)
  # fullobis::set_root(glue("{dir_data}/raw/obis.org"))
  # fullobis::set_useragent("http://github.com/bbest")
  
  # fail early if the parquet export is not at the expected path
  stopifnot(file.exists(obis_prq))
  # open the parquet file as an Arrow dataset; rows are only pulled into R by
  # the collect() call further down
  x <- open_dataset(obis_prq)
  
  # x
  # FileSystemDataset with 1 Parquet file
  # id: string
  # dataset_id: string
  # decimalLongitude: float
  # decimalLatitude: float
  # date_start: int64
  # date_mid: int64
  # date_end: int64
  # date_year: int64
  # scientificName: string
  # originalScientificName: string
  # minimumDepthInMeters: float
  # maximumDepthInMeters: float
  # depth: float
  # coordinateUncertaintyInMeters: float
  # flags: string
  # dropped: bool
  # absence: bool
  # shoredistance: string
  # bathymetry: string
  # sst: string
  # sss: string
  # marine: bool
  # brackish: bool
  # freshwater: bool
  # terrestrial: bool
  # taxonRank: string
  # AphiaID: int64
  # redlist_category: string
  # superdomain: string
  # domain: string
  # kingdom: string
  # subkingdom: string
  # infrakingdom: string
  # phylum: string
  # phylum_division: string
  # subphylum_subdivision: string
  # subphylum: string
  # infraphylum: string
  # parvphylum: string
  # gigaclass: string
  # megaclass: string
  # superclass: string
  # class: string
  # subclass: string
  # infraclass: string
  # subterclass: string
  # superorder: string
  # order: string
  # suborder: string
  # infraorder: string
  # parvorder: string
  # superfamily: string
  # family: string
  # subfamily: string
  # supertribe: string
  # tribe: string
  # subtribe: string
  # genus: string
  # subgenus: string
  # section: string
  # subsection: string
  # series: string
  # species: string
  # subspecies: string
  # natio: string
  # variety: string
  # subvariety: string
  # forma: string
  # subforma: string
  # superdomainid: int64
  # domainid: int64
  # kingdomid: int64
  # subkingdomid: int64
  # infrakingdomid: int64
  # phylumid: int64
  # phylum_divisionid: int64
  # subphylum_subdivisionid: int64
  # subphylumid: int64
  # infraphylumid: int64
  # parvphylumid: int64
  # gigaclassid: int64
  # megaclassid: int64
  # superclassid: int64
  # classid: int64
  # subclassid: int64
  # infraclassid: int64
  # subterclassid: int64
  # superorderid: int64
  # orderid: int64
  # suborderid: int64
  # infraorderid: int64
  # parvorderid: int64
  # superfamilyid: int64
  # familyid: int64
  # subfamilyid: int64
  # supertribeid: int64
  # tribeid: int64
  # subtribeid: int64
  # genusid: int64
  # subgenusid: int64
  # sectionid: int64
  # subsectionid: int64
  # seriesid: int64
  # speciesid: int64
  # subspeciesid: int64
  # natioid: int64
  # varietyid: int64
  # subvarietyid: int64
  # formaid: int64
  # subformaid: int64
  # type: string
  # modified: string
  # language: string
  # license: string
  # rightsHolder: string
  # accessRights: string
  # bibliographicCitation: string
  # references: string
  # institutionID: string
  # collectionID: string
  # datasetID: string
  # institutionCode: string
  # collectionCode: string
  # datasetName: string
  # ownerInstitutionCode: string
  # basisOfRecord: string
  # informationWithheld: string
  # dataGeneralizations: string
  # dynamicProperties: string
  # materialSampleID: string
  # occurrenceID: string
  # catalogNumber: string
  # occurrenceRemarks: string
  # recordNumber: string
  # recordedBy: string
  # recordedByID: string
  # individualCount: string
  # organismQuantity: string
  # organismQuantityType: string
  # sex: string
  # lifeStage: string
  # reproductiveCondition: string
  # behavior: string
  # establishmentMeans: string
  # occurrenceStatus: string
  # preparations: string
  # disposition: string
  # otherCatalogNumbers: string
  # associatedMedia: string
  # associatedReferences: string
  # associatedSequences: string
  # associatedTaxa: string
  # organismID: string
  # organismName: string
  # organismScope: string
  # associatedOccurrences: string
  # associatedOrganisms: string
  # previousIdentifications: string
  # organismRemarks: string
  # eventID: string
  # parentEventID: string
  # samplingProtocol: string
  # sampleSizeValue: string
  # sampleSizeUnit: string
  # samplingEffort: string
  # eventDate: string
  # eventTime: string
  # startDayOfYear: string
  # endDayOfYear: string
  # year: string
  # month: string
  # day: string
  # verbatimEventDate: string
  # habitat: string
  # fieldNumber: string
  # fieldNotes: string
  # eventRemarks: string
  # locationID: string
  # higherGeographyID: string
  # higherGeography: string
  # continent: string
  # waterBody: string
  # islandGroup: string
  # island: string
  # country: string
  # countryCode: string
  # stateProvince: string
  # county: string
  # municipality: string
  # locality: string
  # verbatimLocality: string
  # verbatimElevation: string
  # minimumElevationInMeters: string
  # maximumElevationInMeters: string
  # verbatimDepth: string
  # minimumDistanceAboveSurfaceInMeters: string
  # maximumDistanceAboveSurfaceInMeters: string
  # locationAccordingTo: string
  # locationRemarks: string
  # verbatimCoordinates: string
  # verbatimLatitude: string
  # verbatimLongitude: string
  # verbatimCoordinateSystem: string
  # verbatimSRS: string
  # geodeticDatum: string
  # coordinatePrecision: string
  # pointRadiusSpatialFit: string
  # footprintWKT: string
  # footprintSRS: string
  # footprintSpatialFit: string
  # georeferencedBy: string
  # georeferencedDate: string
  # georeferenceProtocol: string
  # georeferenceSources: string
  # georeferenceVerificationStatus: string
  # georeferenceRemarks: string
  # geologicalContextID: string
  # earliestEonOrLowestEonothem: string
  # latestEonOrHighestEonothem: string
  # earliestEraOrLowestErathem: string
  # latestEraOrHighestErathem: string
  # earliestPeriodOrLowestSystem: string
  # latestPeriodOrHighestSystem: string
  # earliestEpochOrLowestSeries: string
  # latestEpochOrHighestSeries: string
  # earliestAgeOrLowestStage: string
  # latestAgeOrHighestStage: string
  # lowestBiostratigraphicZone: string
  # highestBiostratigraphicZone: string
  # lithostratigraphicTerms: string
  # group: string
  # formation: string
  # member: string
  # bed: string
  # identificationID: string
  # identifiedBy: string
  # identifiedByID: string
  # dateIdentified: string
  # identificationReferences: string
  # identificationRemarks: string
  # identificationQualifier: string
  # identificationVerificationStatus: string
  # typeStatus: string
  # taxonID: string
  # scientificNameID: string
  # acceptedNameUsageID: string
  # parentNameUsageID: string
  # originalNameUsageID: string
  # nameAccordingToID: string
  # namePublishedInID: string
  # taxonConceptID: string
  # acceptedNameUsage: string
  # parentNameUsage: string
  # originalNameUsage: string
  # nameAccordingTo: string
  # namePublishedIn: string
  # namePublishedInYear: string
  # higherClassification: string
  # specificEpithet: string
  # infraspecificEpithet: string
  # verbatimTaxonRank: string
  # scientificNameAuthorship: string
  # vernacularName: string
  # nomenclaturalCode: string
  # taxonomicStatus: string
  # nomenclaturalStatus: string
  # taxonRemarks: string
  
  # columns carried over into the CSV
  col_keep <- c(
    "id", "phylum", "class", "taxonRank", "scientificName", "AphiaID",
    "date_mid", "decimalLongitude", "decimalLatitude", "depth",
    "individualCount", "flags")
  # time the pull: drop records lacking a year or either coordinate, subset
  # the columns, then materialize the result in memory
  system.time({
    o <- x |>
      filter(!is.na(date_year)) |>
      filter(!is.na(decimalLongitude), !is.na(decimalLatitude)) |>
      select(all_of(col_keep)) |>
      collect()
  })
  # before extra columns (class, AphiaID, date_mid, eventDate, depth, flags, dropped, absence):
  #      user   system  elapsed 
  #   246.842 1052.350 1777.397   # /60    = 29.6 min
  # after:
  #      user    system   elapsed 
  #   719.556  7455.860 13976.147 # /60/60 = 3.9  hrs
  # after date_mid:
  #  TODO: enter time
  
  # o
  # TODO: paste tibble first 10 rows output with column type
  
  # problems(o)
  #   individualCount: character like "1500 ?", "~40", ">50", "1 or 2"
  #
  # So these fields aren't helpful: absence, dropped
  #   table(y$absence, useNA = "ifany") # <lgl>
  #        FALSE
  #     27101348
  #   table(y$dropped, useNA = "ifany") # <lgl>
  #        FALSE
  #     27101348
  # 
  #   table(y$flags, useNA = "ifany")   # <chr>
  #                                                                             {} 
  #                                                                       14762438 
  #                                                   {DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                         268866 
  #                                                           {DEPTH_EXCEEDS_BATH} 
  #                                                                         298093 
  #                                                  {DEPTH_OUT_OF_RANGE,NO_DEPTH} 
  #                                                                             63 
  #                                     {MARINE_UNSURE,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                            288 
  #                                             {MARINE_UNSURE,DEPTH_EXCEEDS_BATH} 
  #                                                                            108 
  #                                          {MARINE_UNSURE,MIN_DEPTH_EXCEEDS_MAX} 
  #                                                                              1 
  #                                               {MARINE_UNSURE,NO_DEPTH,ON_LAND} 
  #                                                                           6686 
  #                                                       {MARINE_UNSURE,NO_DEPTH} 
  #                                                                          35799 
  #                                                        {MARINE_UNSURE,ON_LAND} 
  #                                                                            617 
  #                                                                {MARINE_UNSURE} 
  #                                                                          15286 
  #                             {MIN_DEPTH_EXCEEDS_MAX,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                            189 
  #                                     {MIN_DEPTH_EXCEEDS_MAX,DEPTH_EXCEEDS_BATH} 
  #                                                                           1116 
  #                                                {MIN_DEPTH_EXCEEDS_MAX,ON_LAND} 
  #                                                                            150 
  #                                                        {MIN_DEPTH_EXCEEDS_MAX} 
  #                                                                          17719 
  #                                  {NO_ACCEPTED_NAME,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                            236 
  #                                          {NO_ACCEPTED_NAME,DEPTH_EXCEEDS_BATH} 
  #                                                                            648 
  #                    {NO_ACCEPTED_NAME,MARINE_UNSURE,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                              8 
  #                            {NO_ACCEPTED_NAME,MARINE_UNSURE,DEPTH_EXCEEDS_BATH} 
  #                                                                             16 
  #                              {NO_ACCEPTED_NAME,MARINE_UNSURE,NO_DEPTH,ON_LAND} 
  #                                                                             61 
  #                                      {NO_ACCEPTED_NAME,MARINE_UNSURE,NO_DEPTH} 
  #                                                                           1852 
  #                                       {NO_ACCEPTED_NAME,MARINE_UNSURE,ON_LAND} 
  #                                                                             12 
  #                                               {NO_ACCEPTED_NAME,MARINE_UNSURE} 
  #                                                                          13404 
  #            {NO_ACCEPTED_NAME,MIN_DEPTH_EXCEEDS_MAX,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                              1 
  #                               {NO_ACCEPTED_NAME,MIN_DEPTH_EXCEEDS_MAX,ON_LAND} 
  #                                                                              1 
  #                                       {NO_ACCEPTED_NAME,MIN_DEPTH_EXCEEDS_MAX} 
  #                                                                              2 
  #                                            {NO_ACCEPTED_NAME,NO_DEPTH,ON_LAND} 
  #                                                                            411 
  #                                                    {NO_ACCEPTED_NAME,NO_DEPTH} 
  #                                                                          15646 
  #                                                     {NO_ACCEPTED_NAME,ON_LAND} 
  #                                                                            810 
  #                                                             {NO_ACCEPTED_NAME} 
  #                                                                          84670 
  #                                                             {NO_DEPTH,ON_LAND} 
  #                                                                         826034 
  #                                                                     {NO_DEPTH} 
  #                                                                       10540391 
  #                                                                      {ON_LAND} 
  #                                                                         177481 
  #                         {SCIENTIFICNAMEID_EXTERNAL,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                          13693 
  #                                 {SCIENTIFICNAMEID_EXTERNAL,DEPTH_EXCEEDS_BATH} 
  #                                                                             58 
  #                     {SCIENTIFICNAMEID_EXTERNAL,NO_ACCEPTED_NAME,MARINE_UNSURE} 
  #                                                                              1 
  #                                   {SCIENTIFICNAMEID_EXTERNAL,NO_DEPTH,ON_LAND} 
  #                                                                              6 
  #                                           {SCIENTIFICNAMEID_EXTERNAL,NO_DEPTH} 
  #                                                                             20 
  #                                            {SCIENTIFICNAMEID_EXTERNAL,ON_LAND} 
  #                                                                              6 
  #                                                    {SCIENTIFICNAMEID_EXTERNAL} 
  #                                                                            251 
  #                       {WORMS_ANNOTATION_RESOLVABLE_AUTHORITY,NO_DEPTH,ON_LAND} 
  #                                                                             43 
  #                                {WORMS_ANNOTATION_RESOLVABLE_AUTHORITY,ON_LAND} 
  #                                                                             94 
  #                                        {WORMS_ANNOTATION_RESOLVABLE_AUTHORITY} 
  #                                                                            214 
  #                          {WORMS_ANNOTATION_RESOLVABLE_LOSS,DEPTH_EXCEEDS_BATH} 
  #                                                                              1 
  #              {WORMS_ANNOTATION_RESOLVABLE_LOSS,NO_ACCEPTED_NAME,MARINE_UNSURE} 
  #                                                                            262 
  #                   {WORMS_ANNOTATION_RESOLVABLE_LOSS,NO_ACCEPTED_NAME,NO_DEPTH} 
  #                                                                             15 
  #                            {WORMS_ANNOTATION_RESOLVABLE_LOSS,NO_ACCEPTED_NAME} 
  #                                                                             24 
  #                            {WORMS_ANNOTATION_RESOLVABLE_LOSS,NO_DEPTH,ON_LAND} 
  #                                                                             46 
  #                                    {WORMS_ANNOTATION_RESOLVABLE_LOSS,NO_DEPTH} 
  #                                                                            231 
  #                                     {WORMS_ANNOTATION_RESOLVABLE_LOSS,ON_LAND} 
  #                                                                              2 
  #                                             {WORMS_ANNOTATION_RESOLVABLE_LOSS} 
  #                                                                           8175 
  #                       {WORMS_ANNOTATION_RESOLVABLE,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                             23 
  #                               {WORMS_ANNOTATION_RESOLVABLE,DEPTH_EXCEEDS_BATH} 
  #                                                                            133 
  #                                    {WORMS_ANNOTATION_RESOLVABLE,MARINE_UNSURE} 
  #                                                                             73 
  # {WORMS_ANNOTATION_RESOLVABLE,MIN_DEPTH_EXCEEDS_MAX,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                              1 
  #         {WORMS_ANNOTATION_RESOLVABLE,MIN_DEPTH_EXCEEDS_MAX,DEPTH_EXCEEDS_BATH} 
  #                                                                              4 
  #                    {WORMS_ANNOTATION_RESOLVABLE,MIN_DEPTH_EXCEEDS_MAX,ON_LAND} 
  #                                                                              1 
  #                            {WORMS_ANNOTATION_RESOLVABLE,MIN_DEPTH_EXCEEDS_MAX} 
  #                                                                             10 
  #      {WORMS_ANNOTATION_RESOLVABLE,NO_ACCEPTED_NAME,DEPTH_EXCEEDS_BATH,ON_LAND} 
  #                                                                              1 
  #          {WORMS_ANNOTATION_RESOLVABLE,NO_ACCEPTED_NAME,MARINE_UNSURE,NO_DEPTH} 
  #                                                                              3 
  #                   {WORMS_ANNOTATION_RESOLVABLE,NO_ACCEPTED_NAME,MARINE_UNSURE} 
  #                                                                            326 
  #                         {WORMS_ANNOTATION_RESOLVABLE,NO_ACCEPTED_NAME,ON_LAND} 
  #                                                                              1 
  #                                 {WORMS_ANNOTATION_RESOLVABLE,NO_ACCEPTED_NAME} 
  #                                                                             24 
  #                                 {WORMS_ANNOTATION_RESOLVABLE,NO_DEPTH,ON_LAND} 
  #                                                                            244 
  #                                         {WORMS_ANNOTATION_RESOLVABLE,NO_DEPTH} 
  #                                                                           1373 
  #                                          {WORMS_ANNOTATION_RESOLVABLE,ON_LAND} 
  #                                                                             13 
  #                                                  {WORMS_ANNOTATION_RESOLVABLE} 
  #                                                                           6873
  # cache the subset as CSV so the slow parquet read above can be skipped on
  # later runs
  write_csv(o, obis_csv)
  # drop the dataset handle and the in-memory table
  rm(x, o)
}

# Build the per-taxon summary once; the whole block is skipped when the
# summary CSV already exists.
if (!file.exists(obis_sum_csv)) {

  o <- read_csv(obis_csv)

  # one row per taxon: earliest/latest date_mid, number of records and summed
  # individualCount
  o_sum <- o |>
    # TODO: get Kingdom
    group_by(taxonRank, phylum, class, scientificName, AphiaID) |>
    summarize(
      date_min = min(date_mid),
      date_max = max(date_mid),
      n_obs    = n(),
      # individualCount can hold free text (e.g. "~40", ">50"); values that do
      # not coerce to a number become NA and are skipped by na.rm
      n_indiv  = sum(as.numeric(individualCount), na.rm = TRUE),
      .groups  = "drop") |>
    mutate(
      # date_mid is divided by 1000 before conversion, i.e. it is treated as
      # milliseconds since 1970-01-01
      date_min = as.POSIXct(
        date_min / 1000, origin = "1970-01-01", tz = "GMT") |> as.Date(),
      date_max = as.POSIXct(
        date_max / 1000, origin = "1970-01-01", tz = "GMT") |> as.Date()) |>
    arrange(taxonRank, phylum, class, desc(n_obs), scientificName, AphiaID)

  # interactive checks: these values are not printed from inside the if block;
  # run the lines by hand to reproduce the numbers noted below
  o_sum

  n_distinct(o_sum$phylum)
  # 61

  n_distinct(o_sum$class)
  # 174

  dim(o_sum)
  # 89216     9

  write_csv(o_sum, obis_sum_csv)
  # `x` is not created in this block, so only the objects made here are removed
  rm(o, o_sum)
}

# Always reload the summary from disk so the steps below work whether or not
# it was rebuilt above.
# NOTE(review): the rendered output reports parsing issues for this read;
# inspect with problems(o_sum).
o_sum <- read_csv(obis_sum_csv)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 147069 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): taxonRank, phylum, class, scientificName
dbl  (3): AphiaID, n_obs, n_indiv
date (2): date_min, date_max

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# number of summary rows (taxa) per taxonomic rank and phylum
o_sum |>
  group_by(taxonRank, phylum) |>
  summarize(
    n       = n(),
    # drop grouping explicitly, as the other summarize() calls in this file
    # do; this also avoids the "grouped output" message
    .groups = "drop") |>
  datatable()
`summarise()` has grouped output by 'taxonRank'. You can override using the
`.groups` argument.
# most-observed species within each phylum/class combination
o_topsp <- o_sum |>
  filter(taxonRank == "Species") |>
  # order rows so the species with the most observations leads its group
  arrange(phylum, class, desc(n_obs)) |>
  group_by(phylum, class) |>
  summarize(
    # keep the leading row's value for every reported column
    across(
      c(scientificName, AphiaID, date_min, date_max, n_obs, n_indiv),
      first),
    .groups = "drop")
dim(o_topsp)
[1] 261   8
# earlier run: 233   9 (stale; the output above reports 261 x 8)
# save the top-species table to the path set in obis_topsp_csv
write_csv(o_topsp, obis_topsp_csv)

# interactive table of the top species per phylum/class
datatable(o_topsp)