# Provenance Demo

* https://pypi.org/project/prov/
* https://github.com/cehbrecht/provenance-demo


Python PROV library:
* https://prov.readthedocs.io/en/latest/index.html
* https://openprovenance.org/store/
* https://trungdong.github.io/prov-python-short-tutorial.html
* https://nbviewer.ipython.org/github/trungdong/notebooks/blob/master/PROV%20Tutorial.ipynb
* https://github.com/trungdong/prov/blob/master/src/prov/tests/examples.py

ESMValTool Provenance:
* https://github.com/ESMValGroup/ESMValCore/blob/master/esmvalcore/_provenance.py
* https://docs.esmvaltool.org/en/latest/community/diagnostic.html?highlight=provenance#recording-provenance
* https://esmvaltool.cloud.dkrz.de/shared/esmvaltool/v2.0.0/

In [None]:
from prov.model import ProvDocument

In [None]:
# Create a new provenance document. Every namespace, entity, activity, agent
# and relation in the cells below is registered on this object.
doc = ProvDocument()

## Namespaces

In [None]:
# Declare the namespace prefixes used by the qualified names in this example.
#
# prov builds the full URI of a qualified name by concatenating the namespace
# URI and the local name verbatim, so every namespace URI has to end in a
# separator. Without one, 'software:daops==v0.2.0' would expand to
# 'https://roocs.org/softwaredaops==v0.2.0'.
# '#' is used rather than '/' so that local names which themselves start with
# '/' (the absolute dataset paths below) still expand to a well-formed URI.
ROOCS_URI_PREFIX = 'https://roocs.org/'
doc.add_namespace('software', ROOCS_URI_PREFIX + 'software#')
doc.add_namespace('project', ROOCS_URI_PREFIX + 'project#')  # copernicus, roocs
doc.add_namespace('workflow', ROOCS_URI_PREFIX + 'workflow#')  # workflow description
doc.add_namespace('operator', ROOCS_URI_PREFIX + 'operator#')  # task, job, calculation, algorithm
doc.add_namespace('parameter', ROOCS_URI_PREFIX + 'parameter#')  # operator parameter, option
doc.add_namespace('file', ROOCS_URI_PREFIX + 'file#')  # netcdf, plots
doc.add_namespace('attribute', ROOCS_URI_PREFIX + 'attribute#')  # netcdf attributes, headers variables

## Software

In [None]:
# Register daops v0.2.0 as a PROV activity; it is passed as the `starter`
# of both operator runs further down.
# NOTE(review): software is often modelled as a PROV agent - confirm that an
# activity is the intended type here.
daops = doc.activity('software:daops==v0.2.0')

## Project

In [None]:
# Register the project as a PROV agent; the workflow is attributed to it below.
# NOTE(review): the local name contains spaces - confirm it serializes as
# intended in the PROV formats you export to.
project_cds = doc.agent('project:Copernicus Climate Data Store')

## Datasets

In [None]:
# Attribute set shared by every file entity in this example.
attributes = {
    'attribute:variable': 'tas',
}

# Input datasets; the subset outputs are later derived from these.
ds_mpi = doc.entity(
    identifier='file:/data/cmip6/mpi_tas_2000-2010.nc',
    other_attributes=attributes,
)
ds_ipsl = doc.entity(
    identifier='file:/data/cmip6/ipsl_tas_2000-2010.nc',
    other_attributes=attributes,
)

## Operators

In [None]:
# Each operator is modelled as a PROV activity.
# The subset operator carries its time parameter as an extra attribute.
op_subset = doc.activity(
    identifier='operator:subset',
    other_attributes={'parameter:time': '2005'},
)
op_diff = doc.activity(identifier='operator:diff')

## Workflow

In [None]:
# The workflow description is an entity in its own right.
wf_diff = doc.entity(identifier='workflow:diff.json')

# Attribute the workflow to the project agent.
doc.wasAttributedTo(
    wf_diff,
    project_cds,
)

## Run Subset Operator

In [None]:
# Record that the subset operator was started by daops, with the workflow
# as the trigger.
doc.wasStartedBy(
    op_subset,
    trigger=wf_diff,
    starter=daops,
)

In [None]:
# Output file produced for the mpi dataset, derived from the mpi input
# through the subset operator.
output1 = doc.entity(
    identifier='file:mpi_tas_2005.nc',
    other_attributes=attributes,
)
doc.wasDerivedFrom(
    output1,
    ds_mpi,
    activity=op_subset,
)

In [None]:
# Output file produced for the ipsl dataset, derived from the ipsl input
# through the subset operator.
output2 = doc.entity(
    identifier='file:ipsl_tas_2005.nc',
    other_attributes=attributes,
)
doc.wasDerivedFrom(
    output2,
    ds_ipsl,
    activity=op_subset,
)

## Run Diff Operator

In [None]:
# Record that the diff operator was started by daops, with the workflow
# as the trigger.
doc.wasStartedBy(
    op_diff,
    trigger=wf_diff,
    starter=daops,
)

In [None]:
# The diff result is a single file derived from both subset outputs
# through the diff operator.
output_diff = doc.entity(
    identifier='file:diff_tas_2005.nc',
    other_attributes=attributes,
)
doc.wasDerivedFrom(
    output_diff,
    output1,
    activity=op_diff,
)
doc.wasDerivedFrom(
    output_diff,
    output2,
    activity=op_diff,
)

## Show Provenance

In [None]:
# Print the whole document serialized as PROV-N text.
print(doc.get_provn())