Commit 5597b0b8 authored by BorjaEst's avatar BorjaEst
Browse files

Merge branch '7-add-metadata-file-to-output' into dev

parents 2530da17 cf003d33
...@@ -49,14 +49,46 @@ argument is configured at the :doc:`../getting_started/cli` call. ...@@ -49,14 +49,46 @@ argument is configured at the :doc:`../getting_started/cli` call.
# [CUSTOMIZABLE_KEY -- MANDATORY] # [CUSTOMIZABLE_KEY -- MANDATORY]
CCMI-1: CCMI-1:
# Source metadata; common to all models in this source
# [FIXED_KEY -- OPTIONAL]
metadata:
# A metadata information example related to the source
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Source metadata string example
# A metadata information example related to the source
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_1: Source metadata example replaced later by model
# This is the preceded -y1- string at the output folder: '[x1]_[y1]' # This is the preceded -y1- string at the output folder: '[x1]_[y1]'
# [CUSTOMIZABLE_KEY -- MANDATORY] # [CUSTOMIZABLE_KEY -- MANDATORY]
IPSL: IPSL:
# Model metadata; Unique key values for this model
# [FIXED_KEY -- OPTIONAL]
metadata:
# A metadata information example related to the model
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_1: Replaces the metadata from the source
# A metadata information example related to the model
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_2: Model metadata string example
# Represents the information related to tco3 data # Represents the information related to tco3 data
# [FIXED_KEY -- OPTIONAL] # [FIXED_KEY -- OPTIONAL]
tco3_zm: tco3_zm:
# TCO3 metadata; metadata for variable tco3
# [FIXED_KEY -- OPTIONAL]
metadata:
# A tco3 metadata attribute example
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as tco3_zm: -> meta_0:
# Variable name for tco3 array inside the dataset # Variable name for tco3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: toz name: toz
...@@ -82,6 +114,14 @@ argument is configured at the :doc:`../getting_started/cli` call. ...@@ -82,6 +114,14 @@ argument is configured at the :doc:`../getting_started/cli` call.
# [FIXED_KEY -- OPTIONAL] # [FIXED_KEY -- OPTIONAL]
vmro3_zm: vmro3_zm:
# VMRO3 metadata; metadata for variable vmro3
# [FIXED_KEY -- OPTIONAL]
metadata:
# A vmro3 metadata attribute example
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as vmro3_zm: -> meta_0:
# Variable name for vmro3 array inside the dataset # Variable name for vmro3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: vmro3 name: vmro3
...@@ -92,8 +132,8 @@ argument is configured at the :doc:`../getting_started/cli` call. ...@@ -92,8 +132,8 @@ argument is configured at the :doc:`../getting_started/cli` call.
# Coordinates description for vmro3 data. # Coordinates description for vmro3 data.
# [FIXED_KEY -- MANDATORY]: # [FIXED_KEY -- MANDATORY]:
coordinates: coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time time: time
...@@ -129,95 +169,155 @@ key *path* should be the same for both variables. The output expected at ...@@ -129,95 +169,155 @@ key *path* should be the same for both variables. The output expected at
.. code-block:: yaml .. code-block:: yaml
# This is the preceded -x2- string at the output folder: '[x2]_[y-]' # This is the preceded -x2- string at the output folder: '[x2]_[y-]'
# [CUSTOMIZABLE_KEY -- MANDATORY] # [CUSTOMIZABLE_KEY -- MANDATORY]
ECMWF: ECMWF:
# This is the preceded -y1- string at the output folder: '[x2]_[y1]' # Source metadata; common to all models in this source
# [CUSTOMIZABLE_KEY -- MANDATORY] # [FIXED_KEY -- OPTIONAL]
ERA-5: metadata:
# Represents the information related to tco3 data # A metadata information example related to the source
# [FIXED_KEY -- OPTIONAL] # [CUSTOMIZABLE_KEY -- OPTIONAL]
tco3_zm: meta_0: Source metadata string example
# Variable name for tco3 array inside the dataset # A metadata information example related to the source
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [CUSTOMIZABLE_KEY -- OPTIONAL]
name: tco3 meta_1: Source metadata example replaced later by model
# Reg expression, how to load the netCDF files # This is the preceded -y1- string at the output folder: '[x2]_[y1]'
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [CUSTOMIZABLE_KEY -- MANDATORY]
paths: Ecmwf/Era5 ERA-5:
# Coordinates description for tco3 data. # Model metadata; Unique key values for this model
# [FIXED_KEY -- MANDATORY]: # [FIXED_KEY -- OPTIONAL]
coordinates: metadata:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # A metadata information example related to the model
lat: latitude # [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_1: Replaces the metadata from the source
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # A metadata information example related to the model
lon: longitude # [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_2: Model metadata string example
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # Represents the information related to tco3 data
time: time # [FIXED_KEY -- OPTIONAL]
tco3_zm:
# This is the preceded -y2- string at the output folder: '[x2]_[y2]' # TCO3 metadata; metadata for variable tco3
# [CUSTOMIZABLE_KEY -- MANDATORY] # [FIXED_KEY -- OPTIONAL]
ERA-i: metadata:
# Represents the information related to tco3 data # A tco3 metadata attribute example
# [FIXED_KEY -- OPTIONAL] # [CUSTOMIZABLE_KEY -- OPTIONAL]
tco3_zm: meta_0: Structured as tco3_zm: -> meta_0:
# Variable name for tco3 array inside the dataset # Variable name for tco3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: toz name: tco3
# Reg expression, how to load the netCDF files # Reg expression, how to load the netCDF files
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Erai paths: Ecmwf/Era5
# Coordinates description for tco3 data. # Coordinates description for tco3 data.
# [FIXED_KEY -- MANDATORY]: # [FIXED_KEY -- MANDATORY]:
coordinates: coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time lat: latitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude lon: longitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude time: time
# Represents the information related to vmro3 data # This is the preceded -y2- string at the output folder: '[x2]_[y2]'
# [FIXED_KEY -- OPTIONAL] # [CUSTOMIZABLE_KEY -- MANDATORY]
vmro3_zm: ERA-i:
# Variable name for vmro3 array inside the dataset # Model metadata; Unique key values for this model
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- OPTIONAL]
name: vmro3 metadata:
# Reg expression, how to load the netCDF files # A metadata information example related to the model
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [CUSTOMIZABLE_KEY -- OPTIONAL]
paths: Ecmwf/Erai meta_1: Replaces the metadata from the source
# Coordinates description for vmro3 data. # A metadata information example related to the model
# [FIXED_KEY -- MANDATORY]: # [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_2: Model metadata string example
coordinates: # Represents the information related to tco3 data
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # [FIXED_KEY -- OPTIONAL]
time: time tco3_zm:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # TCO3 metadata; metadata for variable tco3
plev: level # [FIXED_KEY -- OPTIONAL]
metadata:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # A tco3 metadata attribute example
lat: latitude # [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as tco3_zm: -> meta_0:
# Variable name for tco3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: toz
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY] # Reg expression, how to load the netCDF files
lon: longitude # [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Erai
# Coordinates description for tco3 data.
# [FIXED_KEY -- MANDATORY]:
coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude
# Represents the information related to vmro3 data
# [FIXED_KEY -- OPTIONAL]
vmro3_zm:
# VMRO3 metadata; metadata for variable vmro3
# [FIXED_KEY -- OPTIONAL]
metadata:
# A vmro3 metadata attribute example
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as vmro3_zm: -> meta_0:
# Variable name for vmro3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: vmro3
# Reg expression, how to load the netCDF files
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Erai
# Coordinates description for vmro3 data.
# [FIXED_KEY -- MANDATORY]:
coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
plev: level
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude
One or two files? One or two files?
----------------- -----------------
......
...@@ -41,7 +41,7 @@ if __name__ == '__main__': ...@@ -41,7 +41,7 @@ if __name__ == '__main__':
# Create sources # Create sources
logging.info("Loading data from './data' ") logging.info("Loading data from './data' ")
with o3skim.cd("data"): with o3skim.cd("data"):
ds = {name: o3skim.Source(name, collection) for ds = {name: o3skim.Source(name, **collection) for
name, collection in config.items()} name, collection in config.items()}
# Skim output # Skim output
......
...@@ -36,6 +36,14 @@ class ModelAccessor: ...@@ -36,6 +36,14 @@ class ModelAccessor:
else: else:
return None return None
@property
def metadata(self):
    """Return the dataset attributes merged with each variable's attributes.

    The result starts from a copy of the dataset-level ``attrs`` and adds
    one entry per data variable, keyed by the variable name, holding a
    copy of that variable's ``attrs``. A variable entry replaces a
    dataset attribute that uses the same key.

    :return: Dataset metadata including the per-variable metadata.
    :rtype: dict
    """
    # Copy so the caller can never mutate the dataset attributes through
    # the returned mapping (without variables the original handed back
    # the ``attrs`` object itself).
    result = dict(self._model.attrs)
    # Iterate the variable names directly: ``Dataset.var()`` is the
    # variance reduction, so calling it only to list the names computed
    # statistics that were then thrown away.
    for var in self._model.data_vars:
        result[var] = dict(self._model[var].attrs)
    return result
def groupby_year(self): def groupby_year(self):
"""Returns a grouped dataset by year""" """Returns a grouped dataset by year"""
logger.debug("Performing group by year on model") logger.debug("Performing group by year on model")
...@@ -115,6 +123,14 @@ class Tests(unittest.TestCase): ...@@ -115,6 +123,14 @@ class Tests(unittest.TestCase):
expected = Tests.vmro3_datarray().to_dataset(name="vmro3_zm") expected = Tests.vmro3_datarray().to_dataset(name="vmro3_zm")
xr.testing.assert_equal(self.ds.model.vmro3, expected) xr.testing.assert_equal(self.ds.model.vmro3, expected)
def test_metadata_property(self):
"""Check the descriptions exposed by ``model.metadata`` for the dataset and for the tco3_zm and vmro3_zm variables."""
metadata = self.ds.model.metadata
self.assertEqual(metadata["description"], "Test dataset")
self.assertEqual(metadata["tco3_zm"]
["description"], "Test tco3 datarray")
self.assertEqual(metadata["vmro3_zm"]
["description"], "Test vmro3 datarray")
def test_groupby_year(self): def test_groupby_year(self):
groups = self.ds.model.groupby_year() groups = self.ds.model.groupby_year()
self.assertEqual(25, len(groups)) self.assertEqual(25, len(groups))
......
...@@ -26,27 +26,30 @@ logger = logging.getLogger('source') ...@@ -26,27 +26,30 @@ logger = logging.getLogger('source')
class Source: class Source:
"""Conceptual class for a data source. It is produced by the loading r"""Conceptual class for a data source. It is produced by the
and standardization of multiple data models. loading and standardization of multiple data models.
The current supported model variables are "tco3_zm" and "vmro3_zm", The current supported model variables are "tco3_zm" and "vmro3_zm",
which should contain the information on how to retrieve the data which should contain the information on how to retrieve the data
from the netCDF collection. from the netCDF collection.
:param name: Name to provide to the source. :param name: Name to provide to the source.
:type name: str :type name: str
:param collections: Dictionary where each 'key' is a model name :param metadata: Source metadata, defaults to {}.
and its value another dictionary with the variable loading :type metadata: dict, optional
statements for that model.
{name:str, paths: str, coordinates: dict} :param \**collections: kwarg where each 'key' is the model
:type collections: dict name and its 'value' another dictionary with the variable
loading statements for that model.
{name:str, paths: str, coordinates: dict, metadata: dict}
""" """
def __init__(self, name, collections): def __init__(self, name, metadata={}, **collections):
self.name = name self._name = name
self._metadata = metadata
self._models = {} self._models = {}
logger.info("Loading source '%s'", self.name) logger.info("Loading source '%s'", name)
for name, specifications in collections.items(): for name, specifications in collections.items():
logger.info("Loading model '%s'", name) logger.info("Loading model '%s'", name)
model = _load_model(**specifications) model = _load_model(**specifications)
...@@ -56,33 +59,63 @@ class Source: ...@@ -56,33 +59,63 @@ class Source:
def __getitem__(self, model_name): def __getitem__(self, model_name):
return self._models[model_name] return self._models[model_name]
@property
def name(self):
"""Name given to the source at initialization (read-only)."""
return self._name
@property @property
def models(self): def models(self):
return list(self._models.keys()) return list(self._models.keys())
@property
def metadata(self):
"""Metadata dict given to the source at initialization (read-only)."""
return self._metadata
def skim(self, groupby=None): def skim(self, groupby=None):
"""Request to skim all source data into the current folder. """Request to skim all source data into the current folder.
The output is generated into multiple folders where The output is generated into multiple folders where
each model output is generated in a folder with the source each model output is generated in a folder with the source
name defined at the source initialization followed by name defined at the source initialization followed by
'_' and the model name: "<source_name>_<model_name>" '_' and the model name: "<source_name>_<model_name>".
If there was metadata added when creating the source, it is
delivered into a "metadata.yaml" file on the directory.
:param groupby: How to group output (None, 'year', 'decade'). :param groupby: How to group output (None, 'year', 'decade').
:type groupby: str, optional :type groupby: str, optional
""" """
for model in self._models: for model in self._models:
dirname = "{source}_{model}".format(source=self.name, model=model) dirname = "{}_{}".format(self._name, model)
os.makedirs(dirname, exist_ok=True) os.makedirs(dirname, exist_ok=True)
logger.info("Skimming data from '%s'", dirname) logger.info("Skimming data from '%s'", dirname)
with utils.cd(dirname): with utils.cd(dirname):
_skim(self[model], delta=groupby) source_metadata = self.metadata
model_metadata = self[model].model.metadata
metadata = {**source_metadata, **model_metadata}
_skim(self[model], delta=groupby, metadata=metadata)
@utils.return_on_failure("Error when loading model", default=None) @utils.return_on_failure("Error when loading model", default=None)
def _load_model(tco3_zm=None, vmro3_zm=None): def _load_model(tco3_zm=None, vmro3_zm=None, metadata={}):
"""Loads a model merging standardized data from specified datasets.""" """Loads a model merging standardized data from specified datasets.
dataset = xr.Dataset()
:param tco3_zm: tco3 variable description, defaults to None.
:type tco3_zm: {name:str, paths:str,
coordinates:{lat:str, lon:str, time:str}},
optional
:param vmro3_zm: vmro3 variable description, defaults to None.
:type vmro3_zm: {name:str, paths:str,
coordinates:{lat:str, lon:str, plev:str, time:str}},
optional
:param metadata: Model metadata, defaults to {}.
:type metadata: dict, optional
:return: Dataset with specified variables.
:rtype: xarray.Dataset
"""
dataset = xr.Dataset(attrs=metadata)
if tco3_zm: if tco3_zm:
logger.debug("Loading tco3_zm into model") logger.debug("Loading tco3_zm into model")
with xr.open_mfdataset(tco3_zm['paths']) as load: with xr.open_mfdataset(tco3_zm['paths']) as load:
...@@ -91,6 +124,7 @@ def _load_model(tco3_zm=None, vmro3_zm=None): ...@@ -91,6 +124,7 @@ def _load_model(tco3_zm=None, vmro3_zm=None):
variable=tco3_zm['name'], variable=tco3_zm['name'],
coordinates=tco3_zm['coordinates']) coordinates=tco3_zm['coordinates'])
dataset = dataset.merge(standardized) dataset = dataset.merge(standardized)
dataset.tco3_zm.attrs = tco3_zm.get('metadata', {})
if vmro3_zm: if vmro3_zm:
logger.debug("Loading vmro3_zm into model") logger.debug("Loading vmro3_zm into model")
with xr.open_mfdataset(vmro3_zm['paths']) as load: with xr.open_mfdataset(vmro3_zm['paths']) as load:
...@@ -99,11 +133,25 @@ def _load_model(tco3_zm=None, vmro3_zm=None): ...@@ -99,11 +133,25 @@ def _load_model(tco3_zm=None, vmro3_zm=None):
variable=vmro3_zm['name'], variable=vmro3_zm['name'],
coordinates=vmro3_zm['coordinates']) coordinates=vmro3_zm['coordinates'])
dataset = dataset.merge(standardized) dataset = dataset.merge(standardized)
dataset.vmro3_zm.attrs = vmro3_zm.get('metadata', {})
return dataset return dataset
def _skim(model, delta=None): def _skim(model, delta=None, metadata=None):
"""Skims model producing reduced dataset files""" """Skims model producing reduced dataset files. It is possible to
indicate the time to split the output by 'delta'. If metadata is
introduced in the form of dict, a 'metadata.yaml' file is
generated together with the skimmed output.
:param model: Dataset with ModelAccessor to skim.
:type model: xarray.Dataset
:param metadata: Model metadata to save as yaml, defaults to None.
:type metadata: dict, optional
:param delta: How to group output (None, 'year', 'decade').
:type delta: str, optional
"""
logger.debug("Skimming model with delta {}".format(delta)) logger.debug("Skimming model with delta {}".format(delta))
skimmed = model.model.skim() skimmed = model.model.skim()
if delta == 'year': if delta == 'year':
...@@ -131,6 +179,9 @@ def _skim(model, delta=None): ...@@ -131,6 +179,9 @@ def _skim(model, delta=None):
datasets=[ds.model.vmro3 for ds in datasets], datasets=[ds.model.vmro3 for ds in datasets],
paths=[vmro3_path(year) for year in years] paths=[vmro3_path(year) for year in years]
) )
if metadata:
logger.debug("Creating metadata.yaml file")
utils.save(file_name="metadata.yaml", metadata=metadata)
class TestsSource(unittest.TestCase): class TestsSource(unittest.TestCase):
......
...@@ -74,3 +74,17 @@ def load(yaml_file): ...@@ -74,3 +74,17 @@ def load(yaml_file):
config = yaml.safe_load(ymlfile) config = yaml.safe_load(ymlfile)
logging.debug("Configuration data: %s", config) logging.debug("Configuration data: %s", config)
return config return config
def save(file_name, metadata):
    """Save the metadata dict into the current folder in yaml format.

    :param file_name: Name for the output yaml file.
    :type file_name: str
    :param metadata: Dict with the data to save into the file.
    :type metadata: dict
    """
    # Write as UTF-8 explicitly: 'allow_unicode=True' makes yaml emit
    # non-ASCII characters unescaped, which raises UnicodeEncodeError
    # when the platform default encoding cannot represent them.
    # Plain 'w' is enough because the file is never read back here.
    with open(file_name, 'w', encoding='utf-8') as ymlfile:
        yaml.dump(metadata, ymlfile, allow_unicode=True)
...@@ -6,6 +6,7 @@ import o3skim ...@@ -6,6 +6,7 @@ import o3skim
import pytest import pytest
import tests.mockup as mockup_data import tests.mockup as mockup_data
import xarray import xarray
import yaml
# configurations ---------------------------------------------------- # configurations ----------------------------------------------------
year_line = range(2000, 2022) year_line = range(2000, 2022)
...@@ -70,7 +71,7 @@ def source_name(request): ...@@ -70,7 +71,7 @@ def source_name(request):
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def source(config_dict, source_name, data_dir): def source(config_dict, source_name, data_dir):
with o3skim.cd(data_dir): with o3skim.cd(data_dir):
source = o3skim.Source(source_name, config_dict[source_name]) source = o3skim.Source(source_name, **config_dict[source_name])
return source return source