Commit 5597b0b8 authored by BorjaEst's avatar BorjaEst
Browse files

Merge branch '7-add-metadata-file-to-output' into dev

parents 2530da17 cf003d33
......@@ -49,14 +49,46 @@ argument is configured at the :doc:`../getting_started/cli` call.
# [CUSTOMIZABLE_KEY -- MANDATORY]
CCMI-1:
# Source metadata; common to all models in this source
# [FIXED_KEY -- OPTIONAL]
metadata:
# A metadata information example related to the source
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Source metadata string example
# A metadata information example related to the source
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_1: Source metadata example replaced later by model
# This is the preceded -y1- string at the output folder: '[x1]_[y1]'
# [CUSTOMIZABLE_KEY -- MANDATORY]
IPSL:
# Model metadata; Unique key values for this model
# [FIXED_KEY -- OPTIONAL]
metadata:
# A metadata information example related to the model
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_1: Replaces the metadata from the source
# A metadata information example related to the model
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_2: Model metadata string example
# Represents the information related to tco3 data
# [FIXED_KEY -- OPTIONAL]
tco3_zm:
# TCO3 metadata; metadata for variable tco3
# [FIXED_KEY -- OPTIONAL]
metadata:
# A tco3 metadata attribute example
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as tco3_zm: -> meta_0:
# Variable name for tco3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: toz
......@@ -82,6 +114,14 @@ argument is configured at the :doc:`../getting_started/cli` call.
# [FIXED_KEY -- OPTIONAL]
vmro3_zm:
# VMRO3 metadata; metadata for variable vmro3
# [FIXED_KEY -- OPTIONAL]
metadata:
# A vmro3 metadata attribute example
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as vmro3_zm: -> meta_0:
# Variable name for vmro3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: vmro3
......@@ -92,8 +132,8 @@ argument is configured at the :doc:`../getting_started/cli` call.
# Coordinates description for vmro3 data.
# [FIXED_KEY -- MANDATORY]:
coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
......@@ -129,95 +169,155 @@ key *path* should be the same for both variables. The output expected at
.. code-block:: yaml
# This is the preceded -x2- string at the output folder: '[x2]_[y-]'
# [CUSTOMIZABLE_KEY -- MANDATORY]
ECMWF:
# This is the preceded -x2- string at the output folder: '[x2]_[y-]'
# [CUSTOMIZABLE_KEY -- MANDATORY]
ECMWF:
# This is the preceded -y1- string at the output folder: '[x2]_[y1]'
# [CUSTOMIZABLE_KEY -- MANDATORY]
ERA-5:
# Source metadata; common to all models in this source
# [FIXED_KEY -- OPTIONAL]
metadata:
# Represents the information related to tco3 data
# [FIXED_KEY -- OPTIONAL]
tco3_zm:
# A metadata information example related to the source
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Source metadata string example
# Variable name for tco3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: tco3
# A metadata information example related to the source
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_1: Source metadata example replaced later by model
# Reg expression, how to load the netCDF files
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Era5
# This is the preceded -y1- string at the output folder: '[x2]_[y1]'
# [CUSTOMIZABLE_KEY -- MANDATORY]
ERA-5:
# Coordinates description for tco3 data.
# [FIXED_KEY -- MANDATORY]:
coordinates:
# Model metadata; Unique key values for this model
# [FIXED_KEY -- OPTIONAL]
metadata:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude
# A metadata information example related to the model
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_1: Replaces the metadata from the source
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude
# A metadata information example related to the model
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_2: Model metadata string example
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
# Represents the information related to tco3 data
# [FIXED_KEY -- OPTIONAL]
tco3_zm:
# This is the preceded -y2- string at the output folder: '[x2]_[y2]'
# [CUSTOMIZABLE_KEY -- MANDATORY]
ERA-i:
# TCO3 metadata; metadata for variable tco3
# [FIXED_KEY -- OPTIONAL]
metadata:
# Represents the information related to tco3 data
# [FIXED_KEY -- OPTIONAL]
tco3_zm:
# A tco3 metadata attribute example
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as tco3_zm: -> meta_0:
# Variable name for tco3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: toz
# Variable name for tco3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: tco3
# Reg expression, how to load the netCDF files
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Erai
# Reg expression, how to load the netCDF files
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Era5
# Coordinates description for tco3 data.
# [FIXED_KEY -- MANDATORY]:
coordinates:
# Coordinates description for tco3 data.
# [FIXED_KEY -- MANDATORY]:
coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
# Represents the information related to vmro3 data
# [FIXED_KEY -- OPTIONAL]
vmro3_zm:
# This is the preceded -y2- string at the output folder: '[x2]_[y2]'
# [CUSTOMIZABLE_KEY -- MANDATORY]
ERA-i:
# Variable name for vmro3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: vmro3
# Model metadata; Unique key values for this model
# [FIXED_KEY -- OPTIONAL]
metadata:
# Reg expression, how to load the netCDF files
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Erai
# A metadata information example related to the model
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_1: Replaces the metadata from the source
# Coordinates description for vmro3 data.
# [FIXED_KEY -- MANDATORY]:
# A metadata information example related to the model
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_2: Model metadata string example
coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
# Represents the information related to tco3 data
# [FIXED_KEY -- OPTIONAL]
tco3_zm:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
plev: level
# TCO3 metadata; metadata for variable tco3
# [FIXED_KEY -- OPTIONAL]
metadata:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude
# A tco3 metadata attribute example
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as tco3_zm: -> meta_0:
# Variable name for tco3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: toz
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude
# Reg expression, how to load the netCDF files
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Erai
# Coordinates description for tco3 data.
# [FIXED_KEY -- MANDATORY]:
coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude
# Represents the information related to vmro3 data
# [FIXED_KEY -- OPTIONAL]
vmro3_zm:
# VMRO3 metadata; metadata for variable vmro3
# [FIXED_KEY -- OPTIONAL]
metadata:
# A vmro3 metadata attribute example
# [CUSTOMIZABLE_KEY -- OPTIONAL]
meta_0: Structured as vmro3_zm: -> meta_0:
# Variable name for vmro3 array inside the dataset
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
name: vmro3
# Reg expression, how to load the netCDF files
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
paths: Ecmwf/Erai
# Coordinates description for vmro3 data.
# [FIXED_KEY -- MANDATORY]:
coordinates:
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
time: time
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
plev: level
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lat: latitude
# [FIXED_KEY -- MANDATORY]: [CORRECT_VALUE -- MANDATORY]
lon: longitude
One or two files?
-----------------
......
......@@ -41,7 +41,7 @@ if __name__ == '__main__':
# Create sources
logging.info("Loading data from './data' ")
with o3skim.cd("data"):
ds = {name: o3skim.Source(name, collection) for
ds = {name: o3skim.Source(name, **collection) for
name, collection in config.items()}
# Skim output
......
......@@ -36,6 +36,14 @@ class ModelAccessor:
else:
return None
@property
def metadata(self):
    """Return the metadata of this dataset as a dictionary.

    The result contains the dataset global attributes and, for each
    data variable of the dataset, one additional entry keyed by the
    variable name whose value is a dictionary with the attributes of
    that variable.

    :return: Dataset attributes plus one attributes dict per variable.
    :rtype: dict
    """
    # Work on a copy so callers cannot alter the dataset attributes
    # through the returned dictionary.
    result = dict(self._model.attrs)
    # Use 'data_vars' to list the variables: 'Dataset.var()' is the
    # variance reduction, not a variable listing.
    for var in self._model.data_vars:
        result[var] = dict(self._model[var].attrs)
    return result
def groupby_year(self):
"""Returns a grouped dataset by year"""
logger.debug("Performing group by year on model")
......@@ -115,6 +123,14 @@ class Tests(unittest.TestCase):
expected = Tests.vmro3_datarray().to_dataset(name="vmro3_zm")
xr.testing.assert_equal(self.ds.model.vmro3, expected)
def test_metadata_property(self):
    """Check the dataset and per-variable descriptions in metadata."""
    attributes = self.ds.model.metadata
    # Global attributes of the dataset
    self.assertEqual(attributes["description"], "Test dataset")
    # Attributes nested under each variable name
    tco3_attributes = attributes["tco3_zm"]
    self.assertEqual(tco3_attributes["description"], "Test tco3 datarray")
    vmro3_attributes = attributes["vmro3_zm"]
    self.assertEqual(vmro3_attributes["description"], "Test vmro3 datarray")
def test_groupby_year(self):
groups = self.ds.model.groupby_year()
self.assertEqual(25, len(groups))
......
......@@ -26,27 +26,30 @@ logger = logging.getLogger('source')
class Source:
"""Conceptual class for a data source. It is produced by the loading
and standardization of multiple data models.
r"""Conceptual class for a data source. It is produced by the
loading and standardization of multiple data models.
The current supported model variables are "tco3_zm" and "vmro3_zm",
which should contain the information on how to retrieve the data
from the netCDF collection.
:param name: Name to provide to the source.
:type name: str
:param collections: Dictionary where each 'key' is a model name
and its value another dictionary with the variable loading
statements for that model.
{name:str, paths: str, coordinates: dict}
:type collections: dict
:param metadata: Source metadata, defaults to {}.
:type metadata: dict, optional
:param \**collections: kwarg where each 'key' is the model
name and its 'value' another dictionary with the variable
loading statements for that model.
{name:str, paths: str, coordinates: dict, metadata: dict}
"""
def __init__(self, name, collections):
self.name = name
def __init__(self, name, metadata={}, **collections):
self._name = name
self._metadata = metadata
self._models = {}
logger.info("Loading source '%s'", self.name)
logger.info("Loading source '%s'", name)
for name, specifications in collections.items():
logger.info("Loading model '%s'", name)
model = _load_model(**specifications)
......@@ -56,33 +59,63 @@ class Source:
def __getitem__(self, model_name):
return self._models[model_name]
@property
def name(self):
    """Name given to the source at initialization.

    :return: The value stored in ``_name`` by the constructor.
    """
    return self._name
@property
def models(self):
    """Names of the models held by this source.

    :return: New list with the keys of the internal models dict.
    :rtype: list
    """
    # Iterating a dict yields its keys.
    return list(self._models)
@property
def metadata(self):
    """Source metadata given at initialization.

    :return: The object stored in ``_metadata`` by the constructor.
    """
    # NOTE: the stored object itself is returned, not a copy.
    return self._metadata
def skim(self, groupby=None):
"""Request to skim all source data into the current folder.
The output is generated into multiple folders, where
each model output is generated in a folder with the source
name defined at the source initialization followed by
'_' and the model name: "<source_name>_<model_name>"
'_' and the model name: "<source_name>_<model_name>".
If there was metadata added when creating the source, it is
delivered into a "metadata.yaml" file on the directory.
:param groupby: How to group output (None, 'year', 'decade').
:type groupby: str, optional
"""
for model in self._models:
dirname = "{source}_{model}".format(source=self.name, model=model)
dirname = "{}_{}".format(self._name, model)
os.makedirs(dirname, exist_ok=True)
logger.info("Skimming data from '%s'", dirname)
with utils.cd(dirname):
_skim(self[model], delta=groupby)
source_metadata = self.metadata
model_metadata = self[model].model.metadata
metadata = {**source_metadata, **model_metadata}
_skim(self[model], delta=groupby, metadata=metadata)
@utils.return_on_failure("Error when loading model", default=None)
def _load_model(tco3_zm=None, vmro3_zm=None):
"""Loads a model merging standardized data from specified datasets."""
dataset = xr.Dataset()
def _load_model(tco3_zm=None, vmro3_zm=None, metadata={}):
"""Loads a model merging standardized data from specified datasets.
:param tco3_zm: tco3 variable description, defaults to None.
:type tco3_zm: {name:str, paths:str,
coordinates:{lat:str, lon:str, time:str}},
optional
:param vmro3_zm: vmro3 variable description, defaults to None.
:type vmro3_zm: {name:str, paths:str,
coordinates:{lat:str, lon:str, plev:str, time:str}},
optional
:param metadata: Model metadata, defaults to {}.
:type metadata: dict, optional
:return: Dataset with specified variables.
:rtype: xarray.Dataset
"""
dataset = xr.Dataset(attrs=metadata)
if tco3_zm:
logger.debug("Loading tco3_zm into model")
with xr.open_mfdataset(tco3_zm['paths']) as load:
......@@ -91,6 +124,7 @@ def _load_model(tco3_zm=None, vmro3_zm=None):
variable=tco3_zm['name'],
coordinates=tco3_zm['coordinates'])
dataset = dataset.merge(standardized)
dataset.tco3_zm.attrs = tco3_zm.get('metadata', {})
if vmro3_zm:
logger.debug("Loading vmro3_zm into model")
with xr.open_mfdataset(vmro3_zm['paths']) as load:
......@@ -99,11 +133,25 @@ def _load_model(tco3_zm=None, vmro3_zm=None):
variable=vmro3_zm['name'],
coordinates=vmro3_zm['coordinates'])
dataset = dataset.merge(standardized)
dataset.vmro3_zm.attrs = vmro3_zm.get('metadata', {})
return dataset
def _skim(model, delta=None):
"""Skims model producing reduced dataset files"""
def _skim(model, delta=None, metadata=None):
"""Skims model producing reduced dataset files. It is possible to
indicate the time to split the output by 'delta'. If metadata is
introduced in the form of dict, a 'metadata.yaml' file is
generated together with the skimmed output.
:param model: Dataset with ModelAccessor to skim.
:type model: xarray.Dataset
:param metadata: Model metadata to save as yaml, defaults to None.
:type metadata: dict, optional
:param delta: How to group output (None, 'year', 'decade').
:type delta: str, optional
"""
logger.debug("Skimming model with delta {}".format(delta))
skimmed = model.model.skim()
if delta == 'year':
......@@ -131,6 +179,9 @@ def _skim(model, delta=None):
datasets=[ds.model.vmro3 for ds in datasets],
paths=[vmro3_path(year) for year in years]
)
if metadata:
logger.debug("Creating metadata.yaml file")
utils.save(file_name="metadata.yaml", metadata=metadata)
class TestsSource(unittest.TestCase):
......
......@@ -74,3 +74,17 @@ def load(yaml_file):
config = yaml.safe_load(ymlfile)
logging.debug("Configuration data: %s", config)
return config
def save(file_name, metadata):
    """Saves the metadata dict on the current folder with yaml
    format.

    The file is created (or truncated if it already exists) and
    written with UTF-8 encoding, so the output does not depend on the
    platform default encoding.

    :param file_name: Name for the output yaml file.
    :type file_name: str
    :param metadata: Dict with the data to save into.
    :type metadata: dict
    """
    # 'allow_unicode=True' makes yaml emit non-ASCII characters as is,
    # therefore the stream encoding is fixed to UTF-8; write-only mode
    # is enough as the file is never read back here.
    with open(file_name, 'w', encoding='utf-8') as ymlfile:
        yaml.dump(metadata, ymlfile, allow_unicode=True)
......@@ -6,6 +6,7 @@ import o3skim
import pytest
import tests.mockup as mockup_data
import xarray
import yaml
# configurations ----------------------------------------------------
year_line = range(2000, 2022)
......@@ -70,7 +71,7 @@ def source_name(request):
@pytest.fixture(scope='module')
def source(config_dict, source_name, data_dir):
with o3skim.cd(data_dir):
source = o3skim.Source(source_name, config_dict[source_name])
source = o3skim.Source(source_name, **config_dict[source_name])
return source
......@@ -105,6 +106,20 @@ def variable(request):
return request.param
@pytest.fixture()
def metadata_file(skimmed, model_name):
    """Yield the metadata file name from inside the model output folder.

    The working directory is switched to "<source_name>_<model_name>"
    while the fixture is active.
    """
    _groupby, source_name = skimmed
    output_folder = "{}_{}".format(source_name, model_name)
    with o3skim.cd(output_folder):
        yield "metadata.yaml"
@pytest.fixture()
def metadata_dict(metadata_file):
    """Return the content of the metadata yaml file parsed by yaml."""
    with open(metadata_file, "r") as stream:
        return yaml.safe_load(stream)
@pytest.fixture()
def skimed_file(skimmed, model_name, variable, year):
groupby, source_name = skimmed
......
SourceSplit:
metadata:
meta_0: Source metadata string example
meta_1: Source metadata to be replaced by model
ModelTCO3:
metadata:
meta_1: Model metadata string example
meta_2: 0
tco3_zm:
name: tco3
paths: SourceSplit/tco3_????.nc
......@@ -7,7 +13,13 @@ SourceSplit:
time: time
lat: latitude
lon: longitude
metadata:
meta_tco3_1: TCO3 metadata string example
meta_tco3_2: 0
ModelVMRO3:
metadata:
meta_1: Model metadata string example
meta_2: 0
vmro3_zm:
name: vmro3
paths: SourceSplit/vmro3_????.nc
......@@ -16,7 +28,13 @@ SourceSplit:
plev: pressure_level
lat: latitude
lon: longitude
metadata:
meta_vmro3_1: VMRO3 metadata string example
meta_vmro3_2: 0
ModelALL:
metadata:
meta_1: Model metadata string example
meta_2: 0
tco3_zm:
name: tco3
paths: SourceSplit/tco3_????.nc
......@@ -24,6 +42,9 @@ SourceSplit:
time: time
lat: latitude
lon: longitude
metadata:
meta_tco3_1: TCO3 metadata string example
meta_tco3_2: 0
vmro3_zm:
name: vmro3
paths: SourceSplit/vmro3_????.nc
......@@ -31,9 +52,18 @@ SourceSplit:
time: time
plev: pressure_level
lat: latitude
lon: longitude
lon: longitude
metadata:
meta_vmro3_1: VMRO3 metadata string example
meta_vmro3_2: 0
SourceMerged:
metadata:
meta_0: Source metadata string example
meta_1: Source metadata to be replaced by model
ModelTCO3:
metadata:
meta_1: Model metadata string example
meta_2: 0
tco3_zm:
name: tco3
paths: SourceMerged/merged_????.nc
......@@ -41,7 +71,13 @@ SourceMerged:
time: time
lat: latitude
lon: longitude
metadata:
meta_tco3_1: TCO3 metadata string example
meta_tco3_2: 0
ModelVMRO3:
metadata:
meta_1: Model metadata string example
meta_2: 0
vmro3_zm:
name: vmro3
paths: SourceMerged/merged_????.nc
......@@ -50,7 +86,13 @@ SourceMerged:
plev: pressure_level
lat: latitude
lon: longitude
metadata:
meta_vmro3_1: VMRO3 metadata string example
meta_vmro3_2: 0