From 83267c441325919a2c1aa0cf3eb6dc388d5b36da Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 09:08:47 +0200
Subject: [PATCH 01/31] Add files to folder structure

---
 setup.py            | 0
 trainer/__init__.py | 0
 trainer/trainer.py  | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 setup.py
 create mode 100644 trainer/__init__.py
 create mode 100644 trainer/trainer.py

diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e69de29
diff --git a/trainer/__init__.py b/trainer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/trainer/trainer.py b/trainer/trainer.py
new file mode 100644
index 0000000..e69de29
--
GitLab

From b3197f225a0a0eaab1e277dc43485471eca6c13c Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 09:27:45 +0200
Subject: [PATCH 02/31] Add setup.py

---
 setup.py                                | 11 +++++++++++
 trainer/__init__.py => yaket/README.md  |  0
 trainer/trainer.py => yaket/__init__.py |  0
 yaket/trainer.py                        |  0
 4 files changed, 11 insertions(+)
 rename trainer/__init__.py => yaket/README.md (100%)
 rename trainer/trainer.py => yaket/__init__.py (100%)
 create mode 100644 yaket/trainer.py

diff --git a/setup.py b/setup.py
index e69de29..17c8231 100644
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,11 @@
+from setuptools import setup, find_packages
+
+setup(name = 'yaket',
+      version = '0.0.1',
+      description = 'YAml KEras Trainer for quick AI development',
+      author = 'Andrea Favia',
+      author_email = 'andrea.favia@pm.me',
+      url = '',
+      packages = find_packages(include = ['yaket', 'yaket.*']),
+      setup_requires = ['flake8'],
+      )
diff --git a/trainer/__init__.py b/yaket/README.md
similarity index 100%
rename from trainer/__init__.py
rename to yaket/README.md
diff --git a/trainer/trainer.py b/yaket/__init__.py
similarity index 100%
rename from trainer/trainer.py
rename to yaket/__init__.py
diff --git a/yaket/trainer.py b/yaket/trainer.py
new file mode 100644
index 0000000..e69de29
--
GitLab

From 0daceae8f204963e0af7a47e3838e3c6f3f3be8e Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 09:31:00 +0200
Subject: [PATCH 03/31] Add .gitignore file

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..56e9397
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.eggs
+*.egg-info
\ No newline at end of file
--
GitLab

From 4e858c43d5b61422f1fc49e648efc019507d0379 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 09:37:11 +0200
Subject: [PATCH 04/31] Add Pydantic schema

---
 yaket/schema/__init__.py | 35 +++++++++++++++++++++++++++++++++++
 yaket/schema/schema.py   |  0
 2 files changed, 35 insertions(+)
 create mode 100644 yaket/schema/__init__.py
 create mode 100644 yaket/schema/schema.py

diff --git a/yaket/schema/__init__.py b/yaket/schema/__init__.py
new file mode 100644
index 0000000..ab3fe33
--- /dev/null
+++ b/yaket/schema/__init__.py
@@ -0,0 +1,35 @@
+from pydantic_yaml import YamlModel
+from typing import Dict, Optional, Any
+from pydantic import (
+    BaseModel,
+    validator,
+    Extra,
+    PositiveInt,
+    conint,
+    conlist,
+    constr,
+    FilePath,
+    DirectoryPath,
+    Field,
+)
+
+
+#pydantic-yaml-0.8.0
+
+
+
+class Training(BaseModel, extra=Extra.allow):
+    autolog: bool
+    optimizer: constr(strict=True)
+    optimizer_params: Optional[Dict[str, Any]] = None
+    metrics: conlist(item_type=str, min_items=1, unique_items=True)
+    epochs: PositiveInt
+    batch_size: PositiveInt  # if format is numpy
+    loss: constr(strict=True)
+    callbacks: conlist(item_type=Dict[str, Any], min_items=0)
+    verbose: conint(ge=1, le=2)
+    shuffle: bool
+    class_weights: conlist(item_type=Any, min_items=1)
+
+class TrainerModel(YamlModel, extra=Extra.allow):
+    training: Training = Field(...)
\ No newline at end of file
diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py
new file mode 100644
index 0000000..e69de29
--
GitLab

From 610276ac56f5fd8547ae7fd3d570100ee7bdd463 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 09:49:01 +0200
Subject: [PATCH 05/31] Move code to __init__ file

---
 .gitignore               |  3 ++-
 yaket/schema/__init__.py | 35 -----------------------------------
 yaket/schema/schema.py   | 35 +++++++++++++++++++++++++++++++++++
 yaket/trainer.py         |  8 ++++++++
 4 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/.gitignore b/.gitignore
index 56e9397..399bf96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.eggs
-*.egg-info
\ No newline at end of file
+*.egg-info
+yaket/schema/__pycache__/
diff --git a/yaket/schema/__init__.py b/yaket/schema/__init__.py
index ab3fe33..e69de29 100644
--- a/yaket/schema/__init__.py
+++ b/yaket/schema/__init__.py
@@ -1,35 +0,0 @@
-from pydantic_yaml import YamlModel
-from typing import Dict, Optional, Any
-from pydantic import (
-    BaseModel,
-    validator,
-    Extra,
-    PositiveInt,
-    conint,
-    conlist,
-    constr,
-    FilePath,
-    DirectoryPath,
-    Field,
-)
-
-
-#pydantic-yaml-0.8.0
-
-
-
-class Training(BaseModel, extra=Extra.allow):
-    autolog: bool
-    optimizer: constr(strict=True)
-    optimizer_params: Optional[Dict[str, Any]] = None
-    metrics: conlist(item_type=str, min_items=1, unique_items=True)
-    epochs: PositiveInt
-    batch_size: PositiveInt  # if format is numpy
-    loss: constr(strict=True)
-    callbacks: conlist(item_type=Dict[str, Any], min_items=0)
-    verbose: conint(ge=1, le=2)
-    shuffle: bool
-    class_weights: conlist(item_type=Any, min_items=1)
-
-class TrainerModel(YamlModel, extra=Extra.allow):
-    training: Training = Field(...)
\ No newline at end of file
diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py
index e69de29..ab3fe33 100644
--- a/yaket/schema/schema.py
+++ b/yaket/schema/schema.py
@@ -0,0 +1,35 @@
+from pydantic_yaml import YamlModel
+from typing import Dict, Optional, Any
+from pydantic import (
+    BaseModel,
+    validator,
+    Extra,
+    PositiveInt,
+    conint,
+    conlist,
+    constr,
+    FilePath,
+    DirectoryPath,
+    Field,
+)
+
+
+#pydantic-yaml-0.8.0
+
+
+
+class Training(BaseModel, extra=Extra.allow):
+    autolog: bool
+    optimizer: constr(strict=True)
+    optimizer_params: Optional[Dict[str, Any]] = None
+    metrics: conlist(item_type=str, min_items=1, unique_items=True)
+    epochs: PositiveInt
+    batch_size: PositiveInt  # if format is numpy
+    loss: constr(strict=True)
+    callbacks: conlist(item_type=Dict[str, Any], min_items=0)
+    verbose: conint(ge=1, le=2)
+    shuffle: bool
+    class_weights: conlist(item_type=Any, min_items=1)
+
+class TrainerModel(YamlModel, extra=Extra.allow):
+    training: Training = Field(...)
\ No newline at end of file
diff --git a/yaket/trainer.py b/yaket/trainer.py
index e69de29..4ab8753 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -0,0 +1,8 @@
+import os
+import sys
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import tensorflow as tf
+import gc
+import warnings
+from schema.schema import TrainerModel
--
GitLab

From 99b00eaa1e5564842b56208b9d5847423625f0d8 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 13:14:51 +0200
Subject: [PATCH 06/31] Add pydantic schema

---
 .gitignore             |  1 +
 setup.py               |  2 ++
 yaket/schema/schema.py | 27 ++++++++++++++++++---------
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index 399bf96..d80ca5b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.eggs
 *.egg-info
 yaket/schema/__pycache__/
+Pipfile
diff --git a/setup.py b/setup.py
index 17c8231..c43541a 100644
--- a/setup.py
+++ b/setup.py
@@ -8,4 +8,6 @@ setup(name = 'yaket',
       url = '',
       packages = find_packages(include = ['yaket', 'yaket.*']),
       setup_requires = ['flake8'],
+      install_requires=['pydantic','pyyaml'],
+      extras_require = {"tensorflow": ["tensorflow>=2.4"]},
       )
diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py
index ab3fe33..9b2bd56 100644
--- a/yaket/schema/schema.py
+++ b/yaket/schema/schema.py
@@ -1,5 +1,6 @@
-from pydantic_yaml import YamlModel
 from typing import Dict, Optional, Any
+import yaml
+import os
 from pydantic import (
     BaseModel,
     validator,
@@ -13,12 +14,7 @@ from pydantic import (
     Field,
 )
 
-
-#pydantic-yaml-0.8.0
-
-
-
-class Training(BaseModel, extra=Extra.allow):
+class TrainingModel(BaseModel, extra=Extra.allow):
     autolog: bool
     optimizer: constr(strict=True)
     optimizer_params: Optional[Dict[str, Any]] = None
@@ -31,5 +27,18 @@ class Training(BaseModel, extra=Extra.allow):
     shuffle: bool
     class_weights: conlist(item_type=Any, min_items=1)
 
-class TrainerModel(YamlModel, extra=Extra.allow):
-    training: Training = Field(...)
\ No newline at end of file
+def yaml_to_pydantic(path: str) -> TrainingModel:
+    if not os.path.exists(path=path):
+        raise FileNotFoundError(f"{path} not found")
+    if os.path.isfile(path) and path.endswith(".yaml"):
+        with open(path, "r") as f:
+            data = yaml.safe_load(f)
+        return TrainingModel(**data)
+    else:
+        raise ValueError(f"{path} is not a yaml file")
+
+
+
+
+
--
GitLab
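A minimal sketch of how the yaml_to_pydantic helper introduced in PATCH 06 can be exercised, assuming an illustrative config file that supplies every field TrainingModel requires (the path and values below are hypothetical, not files from this series; a validate flag is added to the signature in the next patch):

    from yaket.schema.schema import yaml_to_pydantic, TrainingModel

    # hypothetical config/train.yaml:
    #   autolog: False
    #   optimizer: Adam
    #   metrics: [Accuracy]
    #   epochs: 10
    #   batch_size: 32
    #   loss: CategoricalCrossentropy
    #   callbacks: []
    #   verbose: 1
    #   shuffle: True
    #   class_weights: [False]
    config = yaml_to_pydantic("config/train.yaml")  # FileNotFoundError / ValueError on bad paths
    assert isinstance(config, TrainingModel)
    print(config.optimizer, config.epochs)  # typed, validated access to the YAML fields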
From 02efd49e9cedb5432f787b16dc99cfacd1fc4918 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 14:19:38 +0200
Subject: [PATCH 07/31] Add skeleton Trainer

---
 .vscode/settings.json  |   3 +
 setup.py               |   2 +-
 yaket/schema/schema.py |  16 +--
 yaket/trainer.py       | 234 ++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 241 insertions(+), 14 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..de288e1
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.formatting.provider": "black"
+}
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c43541a..5601580 100644
--- a/setup.py
+++ b/setup.py
@@ -8,6 +8,6 @@ setup(name = 'yaket',
       url = '',
       packages = find_packages(include = ['yaket', 'yaket.*']),
       setup_requires = ['flake8'],
-      install_requires=['pydantic','pyyaml'],
+      install_requires=['pydantic','pyyaml','mlflow'],
       extras_require = {"tensorflow": ["tensorflow>=2.4"]},
       )
diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py
index 9b2bd56..7675061 100644
--- a/yaket/schema/schema.py
+++ b/yaket/schema/schema.py
@@ -14,6 +14,7 @@ from pydantic import (
     Field,
 )
 
+
 class TrainingModel(BaseModel, extra=Extra.allow):
     autolog: bool
     optimizer: constr(strict=True)
@@ -27,18 +28,17 @@ class TrainingModel(BaseModel, extra=Extra.allow):
     shuffle: bool
     class_weights: conlist(item_type=Any, min_items=1)
 
-def yaml_to_pydantic(path: str) -> TrainingModel:
+
+def yaml_to_pydantic(path: str, validate: bool) -> TrainingModel:
     if not os.path.exists(path=path):
         raise FileNotFoundError(f"{path} not found")
     if os.path.isfile(path) and path.endswith(".yaml"):
         with open(path, "r") as f:
             data = yaml.safe_load(f)
-        return TrainingModel(**data)
+        return (
+            TrainingModel(**data)
+            if validate
+            else TrainingModel.construct(_fields_set=None, **data)
+        )
     else:
         raise ValueError(f"{path} is not a yaml file")
-
-
-
-
-
diff --git a/yaket/trainer.py b/yaket/trainer.py
index 4ab8753..dde3c76 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -1,8 +1,232 @@
-import os
-import sys
-from typing import List, Optional, Tuple, Union
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union, Any, Dict, Callable
 import numpy as np
 import tensorflow as tf
 import gc
-import warnings
-from schema.schema import TrainerModel
+from schema.schema import TrainingModel, yaml_to_pydantic
+import importlib
+import mlflow
+
+
+@dataclass
+class Trainer:
+    config_path: str
+    model: tf.keras.Model
+    train_dataset: Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset]
+    dev_dataset: Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset]
+    strategy: Optional[tf.distribute.Strategy] = None
+    random_seed: int = 1234
+    validate_yaml: bool = True
+    custom_modules_path: Optional[str] = None
+
+    # internals
+    _config: TrainingModel = None
+    _input_shape: Tuple[int, ...] = None
+    _metrics: List[Union[tf.keras.metrics.Metric, Callable]] = None
+    _callbacks: List[tf.keras.callbacks.Callback] = None
+    _optimizer: tf.keras.optimizers.Optimizer = None
+    _loss: Union[tf.keras.losses.Loss, Callable] = None
+    _custom_module: Callable = None
+    _history: Dict[str, Any] = None
+
+    def _init_trainer(self) -> None:
+        self._config = self._parse_config()
+        self._validate_config_file()
+        if self.custom_modules_path:
+            self._import_custom_model(self.custom_modules_path)
+        self._callbacks = self._get_callbacks()
+        self._optimizer = self._get_optimizer()
+        self._loss = self._get_loss()
+        self._metrics = self._get_metrics()
+        self._input_shape = self._get_input_shape()
+
+    def train(self):
+        """Train the model. Main function to call
+
+        TODO:
+        1. Add save_model option based on checkpoint
+        2. Add tf.distribute.Strategy
+        """
+        self._init_trainer()
+        self._compile_model()
+        self._autolog()
+
+        x, y, batch_size = self._get_x_y_train()  # handle the format of the dataset
+        val_dataset = self._get_x_y_val()
+
+        history = self.model.fit(
+            x=x,
+            y=y,
+            epochs=int(self.config.training.epochs),
+            validation_data=val_dataset,
+            batch_size=batch_size,
+            callbacks=self._callbacks,
+            class_weight=self.class_weight_values,
+            verbose=int(self.config.training.verbose),
+        )
+
+        self._history = history.history
+        self._clean_workspace()
+        return history
+
+    def _get_input_shape(self):
+        """Get the input shape of input dataset"""
+        if isinstance(self.train_dataset, tf.data.Dataset):
+            for x, y in self.train_dataset.take(1):
+                self.input_shape = (None, *x.shape[1:])
+        else:
+            self.input_shape = (None, *self.train_dataset[0].shape[1:])
+        return self.input_shape
+
+    def _get_x_y_val(self):
+        if self.val_dataset is None:
+            return None
+        if isinstance(self.val_dataset, tf.data.Dataset):
+            return self.val_dataset
+        else:
+            val = tf.data.Dataset.from_tensor_slices(self.val_dataset).batch(1)
+            return val
+
+    def _get_x_y_train(self):
+        """Get the x and y for training based on the format of the dataset"""
+        y, batch_size = None, None
+        if isinstance(self.train_dataset, tf.data.Dataset):
+            x = self.train_dataset
+        else:
+            x = tf.data.Dataset.from_tensor_slices(self.train_dataset)
+            if self.config.training.shuffle:
+                x = x.shuffle(self.train_dataset[0].shape[0])
+            x = x.batch(self.config.training.batch_size).prefetch(1)
+        return x, y, batch_size
+
+    @property
+    def config(self):
+        return self._config
+
+    def _compile_model(self) -> None:
+        """Compile the model"""
+        self.model.compile(
+            optimizer=self._optimizer, loss=self._loss, metrics=self._get_metrics()
+        )
+
+    def _get_strategy(self):
+        if self.strategy is None:
+            return tf.distribute.MirroredStrategy()
+        else:
+            return self.strategy
+
+    def _parse_config(self) -> Any:
+        return yaml_to_pydantic(self.config_path, self.validate_yaml)
+
+    def _validate_config_file(self):
+        "Validate existence of the loss, optimizer and callbacks defined in the config file"
+        try:
+            self._get_optimizer()
+            self._get_metrics()
+            self._get_loss()
+        except Exception as e:
+            raise TypeError(
+                f"You are using a module not defined in either keras or in the custom script\n Details: {e}"
+            )
+
+    def _import_custom_model(self, module_name: str):
+        try:
+            self._custom_module = importlib.import_module(module_name)
+        except Exception as e:
+            raise ImportError(f"Error importing {module_name}: {e}")
+
+    def _load_custom_module(
+        self, module_name: str, params: Optional[Dict] = None
+    ) -> Callable:
+        try:
+            if params is None:
+                return getattr(self._custom_module, module_name)
+            else:
+                return getattr(self._custom_module, module_name)(**params)
+        except Exception as e:
+            raise ImportError(f"Error importing {module_name}: It does not exist")
+
+    def _get_optimizer(self) -> tf.keras.optimizers.Optimizer:
+        opt_pars = self.config.training.optimizer_params
+        default_value = "not_found"
+        optimizer = getattr(
+            tf.keras.optimizers, f"{self.config.training.optimizer}", default_value
+        )
+        if isinstance(optimizer, tf.keras.optimizers.Optimizer):
+            return optimizer(opt_pars)
+        else:
+            return self._load_custom_module(optimizer, opt_pars)
+
+    def _get_loss(self) -> Union[tf.keras.losses.Loss, Callable]:
+        loss_name = self.config.training.loss
+        loss = getattr(tf.keras.losses, loss_name, "not_found")
+        if isinstance(loss, tf.keras.losses.Loss):
+            return loss
+        else:  # it's a custom loss
+            return self._load_custom_module(loss_name)
+
+    def _get_callbacks(self) -> List[tf.keras.callbacks.Callback]:
+        callbacks = []
+        for name_callback in self.config.training.callbacks:
+            key = list(name_callback.keys())[0]
+            args = list(name_callback.values())[0]
+
+            callback_value = getattr(tf.keras.callbacks, key, "not_found")
+            if isinstance(callback_value, tf.keras.callbacks.Callback):
+                callbacks.append(callback_value(**args))
+            else:
+                callbacks.append(self._load_custom_module(key, args))
+
+        return callbacks
+
+    def _get_metrics(self) -> List[Union[tf.keras.metrics.Metric, Callable]]:
+        """Get the metrics"""
+        list_metrics = []
+        for metric in self.config.training.metrics:
+            if metric is None:
+                continue
+            metric_value = getattr(tf.keras.metrics, f"{metric}", "not_found")
+            if isinstance(metric_value, tf.keras.metrics.Metric):
+                list_metrics.append(metric_value())
+            else:
+                list_metrics.append(self._load_custom_module(metric))
+        return list_metrics
+
+    @staticmethod
+    def list_available_tf_modules(option: str):
+        """List available optimizers, losses, and metrics in tf.keras"""
+        options_func = {
+            "optimizers": tf.keras.optimizers,
+            "losses": tf.keras.losses,
+            "metrics": tf.keras.metrics,
+        }
+        assert option in list(options_func.keys())
+        modules = [value for value in dir(options_func[option]) if value[0].isupper()]
+        return modules
+
+    def _clean_workspace(self) -> None:
+        """Clean the workspace"""
+        tf.keras.backend.clear_session()
+
+    def _autolog(self) -> None:
+        """Autolog the model"""
+        if self.config.training.autolog:
+            mlflow.tensorflow.autolog(log_models=False, disable=False)
+
+    def _set_randomness(self, random_seed: Optional[int] = None) -> None:
+        """Set the randomness"""
+        if random_seed is not None:
+            tf.random.set_seed(random_seed)
+            np.random.seed(random_seed)
+
+    def clear_ram(self):
+        "Delete model and all datasets saved in the Trainer class"
+        del self.model
+        del self.train_dataset
+        del self.val_dataset
+        del self.test_dataset
+        gc.collect()
+
+    def summary_model(self):
+        """Summary of the model"""
+        self.model.summary()
--
GitLab

From 17f323ae1704c72f38fd1ec44f39b3c9d1df875c Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 14:37:11 +0200
Subject: [PATCH 08/31] Add docstring

---
 yaket/trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/yaket/trainer.py b/yaket/trainer.py
index dde3c76..49d34de 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -30,6 +30,8 @@ class Trainer:
     _history: Dict[str, Any] = None
 
     def _init_trainer(self) -> None:
+        """Initialize the trainer
+        TODO: Add checks + exceptions"""
        self._config = self._parse_config()
        self._validate_config_file()
        if self.custom_modules_path:
--
GitLab
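A sketch of how the Trainer skeleton from PATCH 07 is meant to be driven, assuming a ready Keras model and NumPy tuples (all names and the config path are illustrative; the next patch exercises the same flow in a notebook):

    import numpy as np
    import tensorflow as tf
    from yaket.trainer import Trainer

    model = tf.keras.Sequential(
        [tf.keras.Input(shape=(20,)), tf.keras.layers.Dense(10, activation="softmax")]
    )
    x = np.random.rand(128, 20).astype("float32")
    y = tf.keras.utils.to_categorical(np.random.randint(0, 10, 128), 10)

    trainer = Trainer(
        config_path="trainer.yaml",  # hypothetical path to a config matching TrainingModel
        model=model,
        train_dataset=(x, y),
        dev_dataset=(x, y),          # renamed to val_dataset in PATCH 09
    )
    history = trainer.train()        # parses the YAML, compiles the model, runs model.fit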
From bc746c03e5545d1fad565379288475c5a4db0b2e Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 17:39:00 +0200
Subject: [PATCH 09/31] Add example convnet + made metrics/callbacks optional

---
 .gitignore                             |   3 +
 examples/00-simple-mnist-convnet.ipynb | 198 +++++++++++++++++++++++++
 examples/files/trainer.yaml            |  32 ++++
 yaket/schema/schema.py                 |   9 +-
 yaket/trainer.py                       | 163 +++++++++++++++-----
 5 files changed, 368 insertions(+), 37 deletions(-)
 create mode 100644 examples/00-simple-mnist-convnet.ipynb
 create mode 100644 examples/files/trainer.yaml

diff --git a/.gitignore b/.gitignore
index d80ca5b..26b3885 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@
 *.egg-info
 yaket/schema/__pycache__/
 Pipfile
+mlruns/
+.vscode/launch.json
+yaket/__pycache__/
diff --git a/examples/00-simple-mnist-convnet.ipynb b/examples/00-simple-mnist-convnet.ipynb
new file mode 100644
index 0000000..119fbc3
--- /dev/null
+++ b/examples/00-simple-mnist-convnet.ipynb
@@ -0,0 +1,198 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-07-15 17:35:22.701833: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2022-07-15 17:35:22.706537: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
+      "2022-07-15 17:35:22.706559: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "from tensorflow import keras\n",
+    "from tensorflow.keras import layers\n",
+    "from yaket.trainer import Trainer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "x_train shape: (60000, 28, 28, 1)\n",
+      "60000 train samples\n",
+      "10000 test samples\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Model / data parameters\n",
+    "num_classes = 10\n",
+    "input_shape = (28, 28, 1)\n",
+    "\n",
+    "# Load the data and split it between train and test sets\n",
+    "(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n",
+    "\n",
+    "# Scale images to the [0, 1] range\n",
+    "x_train = x_train.astype(\"float32\") / 255\n",
+    "x_test = x_test.astype(\"float32\") / 255\n",
+    "# Make sure images have shape (28, 28, 1)\n",
+    "x_train = np.expand_dims(x_train, -1)\n",
+    "x_test = np.expand_dims(x_test, -1)\n",
+    "print(\"x_train shape:\", x_train.shape)\n",
+    "print(x_train.shape[0], \"train samples\")\n",
+    "print(x_test.shape[0], \"test samples\")\n",
+    "\n",
+    "\n",
+    "# convert class vectors to binary class matrices\n",
+    "y_train = keras.utils.to_categorical(y_train, num_classes)\n",
+    "y_test = keras.utils.to_categorical(y_test, num_classes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model: \"sequential\"\n",
+      "_________________________________________________________________\n",
+      " Layer (type)                Output Shape              Param #   \n",
+      "=================================================================\n",
+      " conv2d (Conv2D)             (None, 26, 26, 32)        320       \n",
+      "                                                                 \n",
+      " max_pooling2d (MaxPooling2D  (None, 13, 13, 32)       0         \n",
+      " )                                                               \n",
+      "                                                                 \n",
+      " conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     \n",
+      "                                                                 \n",
+      " max_pooling2d_1 (MaxPooling  (None, 5, 5, 64)         0         \n",
+      " 2D)                                                             \n",
+      "                                                                 \n",
+      " flatten (Flatten)           (None, 1600)              0         \n",
+      "                                                                 \n",
+      " dropout (Dropout)           (None, 1600)              0         \n",
+      "                                                                 \n",
+      " dense (Dense)               (None, 10)                16010     \n",
+      "                                                                 \n",
+      "=================================================================\n",
+      "Total params: 34,826\n",
+      "Trainable params: 34,826\n",
+      "Non-trainable params: 0\n",
+      "_________________________________________________________________\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-07-15 17:35:25.479384: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n",
+      "2022-07-15 17:35:25.479497: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)\n",
+      "2022-07-15 17:35:25.479532: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dc8739c0aa9a): /proc/driver/nvidia/version does not exist\n",
+      "2022-07-15 17:35:25.480060: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = keras.Sequential(\n",
+    "    [\n",
+    "        keras.Input(shape=input_shape),\n",
+    "        layers.Conv2D(32, kernel_size=(3, 3), activation=\"relu\"),\n",
+    "        layers.MaxPooling2D(pool_size=(2, 2)),\n",
+    "        layers.Conv2D(64, kernel_size=(3, 3), activation=\"relu\"),\n",
+    "        layers.MaxPooling2D(pool_size=(2, 2)),\n",
+    "        layers.Flatten(),\n",
+    "        layers.Dropout(0.5),\n",
+    "        layers.Dense(num_classes, activation=\"softmax\"),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = '/root/project/yaket/examples/files/trainer.yaml'\n",
+    " \n",
+    "trainer = Trainer(config_path = path, train_dataset=(x_train, y_train), val_dataset=(x_test, y_test), model=model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "'NoneType' object is not iterable",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32m/root/project/yaket/examples/00-simple-mnist-convnet.ipynb Cell 5\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m trainer\u001b[39m.\u001b[39;49m_init_trainer()\n",
+      "File \u001b[0;32m~/project/yaket/yaket/trainer.py:44\u001b[0m, in \u001b[0;36mTrainer._init_trainer\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcustom_modules_path:\n\u001b[1;32m 43\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_import_custom_model(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcustom_modules_path)\n\u001b[0;32m---> 44\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_callbacks \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_callbacks()\n\u001b[1;32m 45\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_input_shape \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_input_shape()\n",
+      "File \u001b[0;32m~/project/yaket/yaket/trainer.py:214\u001b[0m, in \u001b[0;36mTrainer._get_callbacks\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 211\u001b[0m callbacks \u001b[39m=\u001b[39m []\n\u001b[1;32m 212\u001b[0m default_value \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mnot_found\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m--> 214\u001b[0m \u001b[39mfor\u001b[39;00m name_callback \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mcallbacks:\n\u001b[1;32m 215\u001b[0m key \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(name_callback\u001b[39m.\u001b[39mkeys())[\u001b[39m0\u001b[39m]\n\u001b[1;32m 216\u001b[0m args \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(name_callback\u001b[39m.\u001b[39mvalues())[\u001b[39m0\u001b[39m]\n",
+      "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not iterable"
+     ]
+    }
+   ],
+   "source": [
+    "trainer._init_trainer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.4 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/files/trainer.yaml b/examples/files/trainer.yaml
new file mode 100644
index 0000000..e8abe1e
--- /dev/null
+++ b/examples/files/trainer.yaml
@@ -0,0 +1,32 @@
+autolog: False
+optimizer: Adam # SGOptimizer
+optimizer_params:
+  learning_rate: 0.01
+# metrics:
+#   - Accuracy
+batch_size: 128
+loss: CategoricalCrossentropy
+# callbacks:
+#   # - EarlyStopping:
+#   #     monitor: val_loss
+#   #     patience: 10
+#   #     mode: min
+#   #     verbose: 1
+#   # - ReduceLROnPlateau:
+#   #     monitor: val_loss
+#   #     mode: min
+#   #     patience: 5
+#   #     verbose: 1
+#   - ModelCheckpoint:
+#       monitor: val_accuracy
+#       mode: max
+#       verbose: 1
+#       save_best_only: True
+#       save_weights_only: True
+#       filepath: '/tmp/checkpoint'
+
+verbose: 2 # 0, 1, 2
+epochs: 15
+shuffle: False
+class_weights: # First value is boolean
+  - False
diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py
index 7675061..0b80756 100644
--- a/yaket/schema/schema.py
+++ b/yaket/schema/schema.py
@@ -19,14 +19,15 @@ class TrainingModel(BaseModel, extra=Extra.allow):
     autolog: bool
     optimizer: constr(strict=True)
     optimizer_params: Optional[Dict[str, Any]] = None
-    metrics: conlist(item_type=str, min_items=1, unique_items=True)
+
     epochs: PositiveInt
     batch_size: PositiveInt  # if format is numpy
     loss: constr(strict=True)
-    callbacks: conlist(item_type=Dict[str, Any], min_items=0)
+    callbacks: Optional[conlist(item_type=Dict[str, Any], min_items=0)]
+    metrics: Optional[conlist(item_type=str, min_items=1, unique_items=True)]
     verbose: conint(ge=1, le=2)
     shuffle: bool
-    class_weights: conlist(item_type=Any, min_items=1)
+    class_weights: Optional[conlist(item_type=Any, min_items=1)]
 
 
 def yaml_to_pydantic(path: str, validate: bool) -> TrainingModel:
@@ -34,7 +35,7 @@ def yaml_to_pydantic(path: str, validate: bool) -> TrainingModel:
         raise FileNotFoundError(f"{path} not found")
     if os.path.isfile(path) and path.endswith(".yaml"):
         with open(path, "r") as f:
-            data = yaml.safe_load(f, Loader=yaml.FullLoader)
+            data = yaml.safe_load(f)
         return (
             TrainingModel(**data)
             if validate
diff --git a/yaket/trainer.py b/yaket/trainer.py
index 49d34de..059cb96 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -3,9 +3,11 @@ from typing import List, Optional, Tuple, Union, Any, Dict, Callable
 import numpy as np
 import tensorflow as tf
 import gc
-from schema.schema import TrainingModel, yaml_to_pydantic
+from yaket.schema.schema import TrainingModel, yaml_to_pydantic
 import importlib
 import mlflow
+import os
+import time
 
 
 @dataclass
@@ -13,11 +15,12 @@ class Trainer:
     config_path: str
     model: tf.keras.Model
     train_dataset: Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset]
-    dev_dataset: Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset]
+    val_dataset: Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset]
     strategy: Optional[tf.distribute.Strategy] = None
     random_seed: int = 1234
     validate_yaml: bool = True
     custom_modules_path: Optional[str] = None
+
 
     # internals
     _config: TrainingModel = None
@@ -28,6 +31,8 @@ class Trainer:
     _loss: Union[tf.keras.losses.Loss, Callable] = None
     _custom_module: Callable = None
     _history: Dict[str, Any] = None
+    _model_checkpoint: Optional[str] = None
+
 
     def _init_trainer(self) -> None:
         """Initialize the trainer
@@ -37,16 +42,12 @@ class Trainer:
         if self.custom_modules_path:
             self._import_custom_model(self.custom_modules_path)
         self._callbacks = self._get_callbacks()
-        self._optimizer = self._get_optimizer()
-        self._loss = self._get_loss()
-        self._metrics = self._get_metrics()
         self._input_shape = self._get_input_shape()
 
     def train(self):
         """Train the model. Main function to call
 
         TODO:
-        1. Add save_model option based on checkpoint
         2. Add tf.distribute.Strategy
         """
         self._init_trainer()
-        self._compile_model()
         self._autolog()
 
         x, y, batch_size = self._get_x_y_train()  # handle the format of the dataset
         val_dataset = self._get_x_y_val()
@@ -59,14 +60,16 @@ class Trainer:
         history = self.model.fit(
             x=x,
             y=y,
-            epochs=int(self.config.training.epochs),
+            epochs=int(self.config.epochs),
             validation_data=val_dataset,
             batch_size=batch_size,
             callbacks=self._callbacks,
-            class_weight=self.class_weight_values,
-            verbose=int(self.config.training.verbose),
+            class_weight=None,  #TODO: add class_weight,
+            verbose=int(self.config.verbose),
         )
 
+
+        self._save_model()
         self._history = history.history
         self._clean_workspace()
         return history
@@ -80,7 +83,28 @@ class Trainer:
             self.input_shape = (None, *self.train_dataset[0].shape[1:])
         return self.input_shape
 
+    def _save_model(self):
+        """Save the model by loading best checkpoint if available and saving it to mlflow or local path"""
+        if self._model_checkpoint is not None:
+            self.model.load_weights(self._model_checkpoint)
+            if self._autolog:
+                self.model.save('/tmp/best_model')
+                run = mlflow.last_active_run()
+                idx = 7  #TODO: check is always the same
+                artifact_path = run.info.artifact_uri[idx:]
+                self.model.save(artifact_path+f"/best_model")
+            else:
+                os.makedirs(os.getcwd()+'/models', exist_ok=True)
+                t = int(time.time())
+                self.model.save(os.getcwd()+f"/models/{t}_best_model")
+        else:
+            os.makedirs(os.getcwd()+'/models', exist_ok=True)
+            t = int(time.time())
+            self.model.save(os.getcwd()+f"/models/{t}_best_model")
+
+
     def _get_x_y_val(self):
+        """Get the x and y for training based on the format of the dataset"""
         if self.val_dataset is None:
             return None
         if isinstance(self.val_dataset, tf.data.Dataset):
@@ -96,9 +120,9 @@ class Trainer:
             x = self.train_dataset
         else:
             x = tf.data.Dataset.from_tensor_slices(self.train_dataset)
-            if self.config.training.shuffle:
+            if self.config.shuffle:
                 x = x.shuffle(self.train_dataset[0].shape[0])
-            x = x.batch(self.config.training.batch_size).prefetch(1)
+            x = x.batch(self.config.batch_size).prefetch(1)
         return x, y, batch_size
 
     @property
@@ -123,9 +147,9 @@ class Trainer:
     def _validate_config_file(self):
         "Validate existence of the loss, optimizer and callbacks defined in the config file"
         try:
-            self._get_optimizer()
-            self._get_metrics()
-            self._get_loss()
+            self._optimizer = self._get_optimizer()
+            self._metrics = self._get_metrics()
+            self._loss = self._get_loss()
         except Exception as e:
             raise TypeError(
                 f"You are using a module not defined in either keras or in the custom script\n Details: {e}"
@@ -149,32 +173,45 @@ class Trainer:
             raise ImportError(f"Error importing {module_name}: It does not exist")
 
     def _get_optimizer(self) -> tf.keras.optimizers.Optimizer:
-        opt_pars = self.config.training.optimizer_params
+        """Get the optimizer from the config file"""
+        opt_pars = self.config.optimizer_params
         default_value = "not_found"
         optimizer = getattr(
-            tf.keras.optimizers, f"{self.config.training.optimizer}", default_value
+            tf.keras.optimizers, f"{self.config.optimizer}", default_value
         )
-        if isinstance(optimizer, tf.keras.optimizers.Optimizer):
-            return optimizer(opt_pars)
+        if optimizer != default_value:
+            return optimizer(**opt_pars)
         else:
             return self._load_custom_module(optimizer, opt_pars)
 
     def _get_loss(self) -> Union[tf.keras.losses.Loss, Callable]:
-        loss_name = self.config.training.loss
-        loss = getattr(tf.keras.losses, loss_name, "not_found")
-        if isinstance(loss, tf.keras.losses.Loss):
-            return loss
+        """Get the loss from the config file"""
+
+        loss_name = self.config.loss
+        default_value = "not_found"
+
+        loss = getattr(tf.keras.losses, loss_name, default_value)
+        if loss != default_value:
+            return loss()
         else:  # it's a custom loss
             return self._load_custom_module(loss_name)
 
     def _get_callbacks(self) -> List[tf.keras.callbacks.Callback]:
+        """Get the callbacks from the config file"""
+        if self.config.callbacks is None:
+            return None
         callbacks = []
-        for name_callback in self.config.training.callbacks:
+        default_value = "not_found"
+
+        for name_callback in self.config.callbacks:
             key = list(name_callback.keys())[0]
             args = list(name_callback.values())[0]
 
-            callback_value = getattr(tf.keras.callbacks, key, "not_found")
-            if isinstance(callback_value, tf.keras.callbacks.Callback):
+            # Track filepath if it's a ModelCheckpoint
+            self._model_checkpoint = args['filepath'] if key == 'ModelCheckpoint' else None
+
+            callback_value = getattr(tf.keras.callbacks, key, default_value)
+            if callback_value != default_value:
                 callbacks.append(callback_value(**args))
             else:
                 callbacks.append(self._load_custom_module(key, args))
@@ -183,12 +220,16 @@ class Trainer:
 
     def _get_metrics(self) -> List[Union[tf.keras.metrics.Metric, Callable]]:
         """Get the metrics"""
+        if self.config.metrics is None:
+            return None
+
         list_metrics = []
-        for metric in self.config.training.metrics:
+        default_value = "not_found"
+        for metric in self.config.metrics:
             if metric is None:
                 continue
-            metric_value = getattr(tf.keras.metrics, f"{metric}", "not_found")
-            if isinstance(metric_value, tf.keras.metrics.Metric):
+            metric_value = getattr(tf.keras.metrics, f"{metric}", default_value)
+            if metric_value != default_value:
                 list_metrics.append(metric_value())
             else:
                 list_metrics.append(self._load_custom_module(metric))
@@ -212,15 +253,20 @@ class Trainer:
 
     def _autolog(self) -> None:
         """Autolog the model"""
-        if self.config.training.autolog:
-            mlflow.tensorflow.autolog(log_models=False, disable=False)
+        if self.config.autolog:
+            mlflow.tensorflow.autolog(log_models=True, disable=False)
 
     def _set_randomness(self, random_seed: Optional[int] = None) -> None:
         """Set the randomness"""
         if random_seed is not None:
-            tf.random.set_seed(random_seed)
-            np.random.seed(random_seed)
-
+            if tf.__version__ >= "2.9.0":
+                tf.keras.set_random_seed(random_seed)
+                tf.config.experimental.enable_op_determinism()
+            else:
+                tf.random.set_seed(random_seed)
+                np.random.seed(random_seed)
+
+
     def clear_ram(self):
         "Delete model and all datasets saved in the Trainer class"
         del self.model
@@ -232,3 +278,54 @@ class Trainer:
     def summary_model(self):
         """Summary of the model"""
         self.model.summary()
+
+
+
+if __name__ == '__main__':
+    import numpy as np
+    from tensorflow import keras
+    from tensorflow.keras import layers
+
+    # Model / data parameters
+    num_classes = 10
+    input_shape = (28, 28, 1)
+
+    # Load the data and split it between train and test sets
+    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
+
+    # Scale images to the [0, 1] range
+    x_train = x_train.astype("float32") / 255
+    x_test = x_test.astype("float32") / 255
+    # Make sure images have shape (28, 28, 1)
+    x_train = np.expand_dims(x_train, -1)
+    x_test = np.expand_dims(x_test, -1)
+    print("x_train shape:", x_train.shape)
+    print(x_train.shape[0], "train samples")
+    print(x_test.shape[0], "test samples")
+
+
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+
+
+    model = keras.Sequential(
+        [
+            keras.Input(shape=input_shape),
+            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
+            layers.MaxPooling2D(pool_size=(2, 2)),
+            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
+            layers.MaxPooling2D(pool_size=(2, 2)),
+            layers.Flatten(),
+            layers.Dropout(0.5),
+            layers.Dense(num_classes, activation="softmax"),
+        ]
+    )
+
+    model.summary()
+
+
+    path = '/root/project/yaket/examples/files/trainer.yaml'
+
+    trainer = Trainer(config_path = path, train_dataset=(x_train, y_train), val_dataset=(x_test, y_test), model=model)
+    trainer.train()
\ No newline at end of file
--
GitLab
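For reference, each entry under callbacks in the YAML above deserializes to a single-key dict, and _get_callbacks resolves the key against tf.keras.callbacks before falling back to the custom module; a sketch of that lookup with illustrative arguments:

    import tensorflow as tf

    entry = {"EarlyStopping": {"monitor": "val_loss", "patience": 10, "mode": "min"}}
    key = list(entry.keys())[0]     # "EarlyStopping"
    args = list(entry.values())[0]  # its keyword arguments
    callback_cls = getattr(tf.keras.callbacks, key, "not_found")
    if callback_cls != "not_found":
        callback = callback_cls(**args)  # tf.keras.callbacks.EarlyStopping(monitor="val_loss", ...)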
From b147d05f5ff4b84fd9c765657e633ad7ca666684 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 15 Jul 2022 17:39:21 +0200
Subject: [PATCH 10/31] Clear output notebook

---
 examples/00-simple-mnist-convnet.ipynb | 102 +++----------------------
 1 file changed, 9 insertions(+), 93 deletions(-)

diff --git a/examples/00-simple-mnist-convnet.ipynb b/examples/00-simple-mnist-convnet.ipynb
index 119fbc3..08d0ff6 100644
--- a/examples/00-simple-mnist-convnet.ipynb
+++ b/examples/00-simple-mnist-convnet.ipynb
@@ -2,19 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2022-07-15 17:35:22.701833: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
-      "2022-07-15 17:35:22.706537: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
-      "2022-07-15 17:35:22.706559: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import numpy as np\n",
@@ -24,19 +14,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "x_train shape: (60000, 28, 28, 1)\n",
-      "60000 train samples\n",
-      "10000 test samples\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Model / data parameters\n",
@@ -63,52 +43,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Model: \"sequential\"\n",
-      "_________________________________________________________________\n",
-      " Layer (type)                Output Shape              Param #   \n",
-      "=================================================================\n",
-      " conv2d (Conv2D)             (None, 26, 26, 32)        320       \n",
-      "                                                                 \n",
-      " max_pooling2d (MaxPooling2D  (None, 13, 13, 32)       0         \n",
-      " )                                                               \n",
-      "                                                                 \n",
-      " conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     \n",
-      "                                                                 \n",
-      " max_pooling2d_1 (MaxPooling  (None, 5, 5, 64)         0         \n",
-      " 2D)                                                             \n",
-      "                                                                 \n",
-      " flatten (Flatten)           (None, 1600)              0         \n",
-      "                                                                 \n",
-      " dropout (Dropout)           (None, 1600)              0         \n",
-      "                                                                 \n",
-      " dense (Dense)               (None, 10)                16010     \n",
-      "                                                                 \n",
-      "=================================================================\n",
-      "Total params: 34,826\n",
-      "Trainable params: 34,826\n",
-      "Non-trainable params: 0\n",
-      "_________________________________________________________________\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2022-07-15 17:35:25.479384: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n",
-      "2022-07-15 17:35:25.479497: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)\n",
-      "2022-07-15 17:35:25.479532: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dc8739c0aa9a): /proc/driver/nvidia/version does not exist\n",
-      "2022-07-15 17:35:25.480060: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
-      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "model = keras.Sequential(\n",
@@ -128,7 +65,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -139,33 +76,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "ename": "TypeError",
-     "evalue": "'NoneType' object is not iterable",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[1;32m/root/project/yaket/examples/00-simple-mnist-convnet.ipynb Cell 5\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m trainer\u001b[39m.\u001b[39;49m_init_trainer()\n",
-      "File \u001b[0;32m~/project/yaket/yaket/trainer.py:44\u001b[0m, in \u001b[0;36mTrainer._init_trainer\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcustom_modules_path:\n\u001b[1;32m 43\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_import_custom_model(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcustom_modules_path)\n\u001b[0;32m---> 44\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_callbacks \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_callbacks()\n\u001b[1;32m 45\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_input_shape \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_input_shape()\n",
-      "File \u001b[0;32m~/project/yaket/yaket/trainer.py:214\u001b[0m, in \u001b[0;36mTrainer._get_callbacks\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 211\u001b[0m callbacks \u001b[39m=\u001b[39m []\n\u001b[1;32m 212\u001b[0m default_value \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mnot_found\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m--> 214\u001b[0m \u001b[39mfor\u001b[39;00m name_callback \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mcallbacks:\n\u001b[1;32m 215\u001b[0m key \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(name_callback\u001b[39m.\u001b[39mkeys())[\u001b[39m0\u001b[39m]\n\u001b[1;32m 216\u001b[0m args \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(name_callback\u001b[39m.\u001b[39mvalues())[\u001b[39m0\u001b[39m]\n",
-      "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not iterable"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "trainer._init_trainer()"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
--
GitLab
From 553400cae8917ebc31b04b8ddd2aadefc3bae720 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Thu, 21 Jul 2022 10:35:16 +0200
Subject: [PATCH 11/31] Add Accelerator enum for selection

---
 .gitignore                  |   1 +
 examples/files/trainer.yaml |   1 +
 yaket/schema/schema.py      |  20 ++++++
 yaket/trainer.py            | 117 +++++++++++++++++++++++++-----------
 4 files changed, 104 insertions(+), 35 deletions(-)

diff --git a/.gitignore b/.gitignore
index 26b3885..a647eb6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ Pipfile
 mlruns/
 .vscode/launch.json
 yaket/__pycache__/
+.vscode/
\ No newline at end of file
diff --git a/examples/files/trainer.yaml b/examples/files/trainer.yaml
index e8abe1e..3daf874 100644
--- a/examples/files/trainer.yaml
+++ b/examples/files/trainer.yaml
@@ -30,3 +30,4 @@ epochs: 15
 shuffle: False
 class_weights: # First value is boolean
   - False
+accelerator: cpu #Make it optional
diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py
index 0b80756..0a03352 100644
--- a/yaket/schema/schema.py
+++ b/yaket/schema/schema.py
@@ -13,6 +13,17 @@ from pydantic import (
     DirectoryPath,
     Field,
 )
+from enum import Enum, auto
+
+
+class Accelerator(Enum):
+    cpu = auto()
+    gpu = auto()
+    mgpu = auto()
+    tpu = auto()
+    @classmethod
+    def list(cls):
+        return list(map(lambda c: c.name, cls))
 
 
 class TrainingModel(BaseModel, extra=Extra.allow):
@@ -28,6 +39,15 @@ class TrainingModel(BaseModel, extra=Extra.allow):
     verbose: conint(ge=1, le=2)
     shuffle: bool
     class_weights: Optional[conlist(item_type=Any, min_items=1)]
+    accelerator: Optional[constr(strict=True)]
+
+    @validator('accelerator')
+    def accelerator_validator(cls, v):
+        if v is None:
+            return None
+        if v not in Accelerator.list():
+            raise ValueError(f'{v} is not a valid accelerator.\nPlease use: {Accelerator.list()}')
+        return v
 
 
 def yaml_to_pydantic(path: str, validate: bool) -> TrainingModel:
diff --git a/yaket/trainer.py b/yaket/trainer.py
index 059cb96..ad51e41 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -1,9 +1,11 @@
 from dataclasses import dataclass
+from enum import Enum, auto
+
 from typing import List, Optional, Tuple, Union, Any, Dict, Callable
 import numpy as np
 import tensorflow as tf
 import gc
-from yaket.schema.schema import TrainingModel, yaml_to_pydantic
+from yaket.schema.schema import TrainingModel, yaml_to_pydantic, Accelerator
 import importlib
 import mlflow
 import os
 import time
@@ -20,6 +22,7 @@ class Trainer:
     random_seed: int = 1234
     validate_yaml: bool = True
     custom_modules_path: Optional[str] = None
+
 
     # internals
@@ -32,41 +35,65 @@ class Trainer:
     _custom_module: Callable = None
     _history: Dict[str, Any] = None
     _model_checkpoint: Optional[str] = None
+    _accelerator: Optional[Accelerator] = None
 
     def _init_trainer(self) -> None:
-        """Initialize the trainer
-        TODO: Add checks + exceptions"""
+        """Initialize the trainer"""
+
+        if not isinstance(self.model, tf.keras.models.Model):
+            raise Exception('model must be keras model')
+        if not isinstance(self.config_path,str) or not os.path.isfile(self.config_path):
+            raise Exception('Config path must be a valid file path')
+        if self.strategy is not None and not isinstance(self.strategy, tf.distribute.Strategy):
+            raise Exception("Strategy must be keras strategy object")
+        if not isinstance(self.random_seed, int):
+            raise Exception("Random seed must be an integer")
+        if not isinstance(self.validate_yaml, bool):
+            raise Exception("Validate yaml must be a boolean value")
+        if self.custom_modules_path is not None:
+            if not isinstance(self.custom_modules_path, str) or not os.path.isfile(self.custom_modules_path):
+                raise Exception("Custom modules path must be a valid path string")
+
+        self._config = self._parse_config()
+        self._accelerator = Accelerator[self.config.accelerator]
         self._validate_config_file()
         if self.custom_modules_path:
             self._import_custom_model(self.custom_modules_path)
         self._callbacks = self._get_callbacks()
         self._input_shape = self._get_input_shape()
+
+
     def train(self):
-        """Train the model. Main function to call
-
-        TODO:
-        2. Add tf.distribute.Strategy
+        """Train the model. Main function to call
+
+        TODO:
+        1. Clone the model within tf.distributed.strategy()
+        2. Input of GeneratorDatasetOp::Dataset will not be optimized. Check tf.data with strategy
+        """
         self._init_trainer()
-        self._compile_model()
         self._autolog()
 
         x, y, batch_size = self._get_x_y_train()  # handle the format of the dataset
         val_dataset = self._get_x_y_val()
-
-        history = self.model.fit(
-            x=x,
-            y=y,
-            epochs=int(self.config.epochs),
-            validation_data=val_dataset,
-            batch_size=batch_size,
-            callbacks=self._callbacks,
-            class_weight=None,  #TODO: add class_weight,
-            verbose=int(self.config.verbose),
-        )
+
+        strategy = self._get_strategy()
+        with strategy.scope():
+
+            self._compile_model()
+            history = self.model.fit(
+                x=x,
+                y=y,
+                epochs=int(self.config.epochs),
+                validation_data=val_dataset,
+                batch_size=batch_size,
+                callbacks=self._callbacks,
+                class_weight=None,  #TODO: add class_weight,
+                verbose=int(self.config.verbose),
+            )
 
         self._save_model()
@@ -131,13 +158,30 @@ class Trainer:
     def _compile_model(self) -> None:
         """Compile the model"""
+
+        self._optimizer = self._get_optimizer()
+        self._loss = self._get_loss()
+        self._metrics = self._get_metrics()
+
         self.model.compile(
             optimizer=self._optimizer, loss=self._loss, metrics=self._get_metrics()
         )
 
     def _get_strategy(self):
         if self.strategy is None:
-            return tf.distribute.MirroredStrategy()
+            #TODO: fix here with accelerator as parameter
+            if self._accelerator is None:
+                return
+
+            if self._accelerator is Accelerator.cpu:
+                index = 0  #TODO: take freer gpu (._get_best_gpu())
+                return tf.distribute.OneDeviceStrategy(f"/gpu:{index}")
+            if self._accelerator is Accelerator.gpu or self._accelerator is Accelerator.mgpu:
+                # If GPUs are not available, it will use CPUs
+                return tf.distribute.MirroredStrategy()
+            if self._accelerator is Accelerator.tpu:
+                #TODO: check configuration for tpu strategy
+                return tf.distribute.TPUStrategy()
         else:
             return self.strategy
 
@@ -147,9 +191,9 @@ class Trainer:
     def _validate_config_file(self):
         "Validate existence of the loss, optimizer and callbacks defined in the config file"
         try:
-            self._optimizer = self._get_optimizer()
-            self._metrics = self._get_metrics()
-            self._loss = self._get_loss()
+            self._get_optimizer()
+            self._get_metrics()
+            self._get_loss()
         except Exception as e:
             raise TypeError(
                 f"You are using a module not defined in either keras or in the custom script\n Details: {e}"
@@ -252,7 +296,7 @@ class Trainer:
         tf.keras.backend.clear_session()
 
     def _autolog(self) -> None:
-        """Autolog the model"""
+        """Autolog the model using MLFlow"""
         if self.config.autolog:
             mlflow.tensorflow.autolog(log_models=True, disable=False)
 
@@ -308,24 +352,27 @@ if __name__ == '__main__':
     y_train = keras.utils.to_categorical(y_train, num_classes)
     y_test = keras.utils.to_categorical(y_test, num_classes)
 
-
-    model = keras.Sequential(
-        [
-            keras.Input(shape=input_shape),
-            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
-            layers.MaxPooling2D(pool_size=(2, 2)),
-            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
-            layers.MaxPooling2D(pool_size=(2, 2)),
-            layers.Flatten(),
-            layers.Dropout(0.5),
-            layers.Dense(num_classes, activation="softmax"),
-        ]
-    )
+    strategy = tf.distribute.MirroredStrategy()
+    with strategy.scope():
+        model = keras.Sequential(
+            [
+                keras.Input(shape=input_shape),
+                layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
+                layers.MaxPooling2D(pool_size=(2, 2)),
+                layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
+                layers.MaxPooling2D(pool_size=(2, 2)),
+                layers.Flatten(),
+                layers.Dropout(0.5),
+                layers.Dense(num_classes, activation="softmax"),
+            ]
+        )
 
     model.summary()
 
+
     path = '/root/project/yaket/examples/files/trainer.yaml'
 
-    trainer = Trainer(config_path = path, train_dataset=(x_train, y_train), val_dataset=(x_test, y_test), model=model)
+    trainer = Trainer(config_path = path, train_dataset=(x_train, y_train), val_dataset=(x_test, y_test), \
+        model=model, strategy=strategy)
     trainer.train()
\ No newline at end of file
--
GitLab
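The accelerator key added to the config above is validated against the enum's member names, and the Trainer later indexes the enum with that validated string; a small sketch of the round trip:

    from yaket.schema.schema import Accelerator

    Accelerator.list()        # ['cpu', 'gpu', 'mgpu', 'tpu']
    acc = Accelerator["cpu"]  # same lookup as Accelerator[self.config.accelerator]
    assert acc is Accelerator.cpu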
From 5577397b4e92ac832a603fd62c1073ab57158f5e Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Thu, 21 Jul 2022 11:12:17 +0200
Subject: [PATCH 12/31] Add get_freer_gpu for selection of gpu

---
 .gitignore                  |  3 ++-
 examples/files/trainer.yaml |  4 ++--
 yaket/trainer.py            | 45 ++++++++++++++++++++++++++-----------
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/.gitignore b/.gitignore
index a647eb6..f7d6237 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@ Pipfile
 mlruns/
 .vscode/launch.json
 yaket/__pycache__/
-.vscode/
\ No newline at end of file
+.vscode/
+models/
diff --git a/examples/files/trainer.yaml b/examples/files/trainer.yaml
index 3daf874..de655b5 100644
--- a/examples/files/trainer.yaml
+++ b/examples/files/trainer.yaml
@@ -25,8 +25,8 @@ loss: CategoricalCrossentropy
 #       save_weights_only: True
 #       filepath: '/tmp/checkpoint'
 
-verbose: 2 # 0, 1, 2
-epochs: 15
+verbose: 1 # 0, 1, 2
+epochs: 1
 shuffle: False
 class_weights: # First value is boolean
   - False
diff --git a/yaket/trainer.py b/yaket/trainer.py
index ad51e41..8703253 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -10,6 +10,8 @@ import importlib
 import mlflow
 import os
 import time
+import subprocess as sp
+
 
 
 @dataclass
@@ -77,19 +79,25 @@ class Trainer:
         self._init_trainer()
         self._autolog()
 
-        x, y, batch_size = self._get_x_y_train()  # handle the format of the dataset
+        train_dataset = self._get_x_y_train()  # handle the format of the dataset
         val_dataset = self._get_x_y_val()
 
         strategy = self._get_strategy()
 
         with strategy.scope():
+            self._clone_model()
             self._compile_model()
+
+            train_dataset = strategy.experimental_distribute_dataset(train_dataset)
+            val_dataset = strategy.experimental_distribute_dataset(val_dataset)
+
             history = self.model.fit(
-                x=x,
-                y=y,
+                x=train_dataset,
+                y=None,
                 epochs=int(self.config.epochs),
                 validation_data=val_dataset,
-                batch_size=batch_size,
+                batch_size=None,
                 callbacks=self._callbacks,
                 class_weight=None,  #TODO: add class_weight,
                 verbose=int(self.config.verbose),
             )
@@ -128,6 +136,9 @@ class Trainer:
             os.makedirs(os.getcwd()+'/models', exist_ok=True)
             t = int(time.time())
             self.model.save(os.getcwd()+f"/models/{t}_best_model")
+    def _clone_model(self):
+        """Clone the model so that it works within tf.distribute.Strategy"""
+        self.model = tf.keras.models.clone_model(self.model)
 
 
     def _get_x_y_val(self):
@@ -142,7 +153,6 @@ class Trainer:
 
     def _get_x_y_train(self):
         """Get the x and y for training based on the format of the dataset"""
-        y, batch_size = None, None
         if isinstance(self.train_dataset, tf.data.Dataset):
             x = self.train_dataset
         else:
             x = tf.data.Dataset.from_tensor_slices(self.train_dataset)
             if self.config.shuffle:
                 x = x.shuffle(self.train_dataset[0].shape[0])
             x = x.batch(self.config.batch_size).prefetch(1)
-        return x, y, batch_size
+        return x
 
     @property
     def config(self):
         return self._config
@@ -169,14 +179,12 @@ class Trainer:
     def _get_strategy(self):
         if self.strategy is None:
-            #TODO: fix here with accelerator as parameter
-            if self._accelerator is None:
-                return
-
-            if self._accelerator is Accelerator.cpu:
-                index = 0  #TODO: take freer gpu (._get_best_gpu())
-                return tf.distribute.OneDeviceStrategy(f"/gpu:{index}")
-            if self._accelerator is Accelerator.gpu or self._accelerator is Accelerator.mgpu:
+            if self._accelerator is None :
+                return tf.distribute.MirroredStrategy()
+            if self._accelerator is Accelerator.gpu:
+                index = Trainer.get_free_gpu_idx()
+                return tf.distribute.OneDeviceStrategy(f"/gpu:{index}")
+            if self._accelerator is Accelerator.cpu or self._accelerator is Accelerator.mgpu:
                 # If GPUs are not available, it will use CPUs
                 return tf.distribute.MirroredStrategy()
             if self._accelerator is Accelerator.tpu:
                 #TODO: check configuration for tpu strategy
                 return tf.distribute.TPUStrategy()
         else:
             return self.strategy
+
+    @staticmethod
+    def get_free_gpu_idx():
+        """Get the index of the freer GPU"""
+        command = "nvidia-smi --query-gpu=memory.free --format=csv"
+        memory_free_info = (
+            sp.check_output(command.split()).decode("ascii").split("\n")[:-1][1:]
+        )
+        memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
+        return int(np.argmax(memory_free_values))
 
     def _parse_config(self) -> Any:
         return yaml_to_pydantic(self.config_path, self.validate_yaml)
--
GitLab
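get_free_gpu_idx above parses the CSV that nvidia-smi prints; a sketch of that parsing on an assumed two-GPU machine (the numbers are illustrative):

    import numpy as np

    # nvidia-smi --query-gpu=memory.free --format=csv prints roughly:
    #   memory.free [MiB]   <- header row, dropped by the [1:] slice
    #   11019 MiB
    #   2048 MiB
    memory_free_info = ["11019 MiB", "2048 MiB"]
    memory_free_values = [int(x.split()[0]) for x in memory_free_info]  # [11019, 2048]
    freest = int(np.argmax(memory_free_values))  # 0 -> "/gpu:0" in the OneDeviceStrategy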
From a84b1b83577f148ab530b014375cd83bb5d9839e Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Thu, 21 Jul 2022 12:10:41 +0200
Subject: [PATCH 14/31] Add option to use either dict or str for metrics/callbacks

---
 examples/files/trainer.yaml | 47 ++++++++++++++++++-------------------
 yaket/schema/schema.py      | 21 ++++++++---------
 yaket/trainer.py            | 33 ++++++++++++++++++--------
 3 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/examples/files/trainer.yaml b/examples/files/trainer.yaml
index de655b5..3a2dabf 100644
--- a/examples/files/trainer.yaml
+++ b/examples/files/trainer.yaml
@@ -1,33 +1,32 @@
 autolog: False
-optimizer: Adam # SGOptimizer
+# optimizer: Adam # SGOptimizer
 optimizer_params:
   learning_rate: 0.01
-# metrics:
-#   - Accuracy
+metrics:
+- CategoricalAccuracy
+- AUC:
+    curve: PR
+    name: prc
 batch_size: 128
 loss: CategoricalCrossentropy
-# callbacks:
-# #  - EarlyStopping:
-# #      monitor: val_loss
-# #      patience: 10
-# #      mode: min
-# #      verbose: 1
-# #  - ReduceLROnPlateau:
-# #      monitor: val_loss
-# #      mode: min
-# #      patience: 5
-# #      verbose: 1
-#   - ModelCheckpoint:
-#       monitor: val_accuracy
-#       mode: max
-#       verbose: 1
-#       save_best_only: True
-#       save_weights_only: True
-#       filepath: '/tmp/checkpoint'
+callbacks:
+  - EarlyStopping
+  - ReduceLROnPlateau:
monitor: val_loss + mode: min + patience: 5 + # # verbose: 1 + # - ModelCheckpoint: + # monitor: val_accuracy + # mode: max + # verbose: 1 + # save_best_only: True + # save_weights_only: True + # filepath: '/tmp/checkpoint' -verbose: 1 # 0, 1, 2 -epochs: 1 -shuffle: False +# verbose: 1 # 0, 1, 2 +# epochs: 1 +# shuffle: False class_weights: # First value is boolean - False accelerator: cpu #Make it optional diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py index 0a03352..0c29322 100644 --- a/yaket/schema/schema.py +++ b/yaket/schema/schema.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Any +from typing import Dict, Optional, Any, Tuple, Union import yaml import os from pydantic import ( @@ -27,18 +27,17 @@ class Accelerator(Enum): class TrainingModel(BaseModel, extra=Extra.allow): - autolog: bool - optimizer: constr(strict=True) + autolog: bool = False + optimizer: constr(strict=True) = 'Adam' optimizer_params: Optional[Dict[str, Any]] = None - - epochs: PositiveInt - batch_size: PositiveInt # if format is numpy + epochs: PositiveInt = 1 + batch_size: PositiveInt = 1 # if format is numpy loss: constr(strict=True) - callbacks: Optional[conlist(item_type=Dict[str, Any], min_items=0)] - metrics: Optional[conlist(item_type=str, min_items=1, unique_items=True)] - verbose: conint(ge=1, le=2) - shuffle: bool - class_weights: Optional[conlist(item_type=Any, min_items=1)] + callbacks: Optional[conlist(item_type=Union[str,Dict[str, Any]], min_items=0)] + metrics: Optional[conlist(item_type=Union[str,Dict], min_items=1, unique_items=True)] + verbose: conint(ge=1, le=2) = 1 + shuffle: bool = True + class_weights: Optional[conlist(item_type=Any, min_items=1)] accelerator: Optional[constr(strict=True)] @validator('accelerator') diff --git a/yaket/trainer.py b/yaket/trainer.py index 654acaf..86455a3 100644 --- a/yaket/trainer.py +++ b/yaket/trainer.py @@ -38,6 +38,7 @@ class Trainer: _history: Dict[str, Any] = None _model_checkpoint: Optional[str] = None _accelerator: Optional[Accelerator] = None + _log:bool = False def _init_trainer(self) -> None: @@ -121,7 +122,7 @@ class Trainer: """Save the model by loading best checkpoint if available and saving it to mlflow or local path""" if self._model_checkpoint is not None: self.model.load_weights(self._model_checkpoint) - if self._autolog: + if self._log: self.model.save('/tmp/best_model') run = mlflow.last_active_run() idx = 7 #TODO: check is always the same @@ -275,15 +276,20 @@ class Trainer: default_value = "not_found" for name_callback in self.config.callbacks: - key = list(name_callback.keys())[0] - args = list(name_callback.values())[0] + if isinstance(name_callback, Dict): + key = list(name_callback.keys())[0] + args = list(name_callback.values())[0] - # Track filepath if it's a ModelCheckpoint - self._model_checkpoint =args['filepath'] if key == 'ModelCheckpoint' else None + # Track filepath if it's a ModelCheckpoint + self._model_checkpoint =args['filepath'] if key == 'ModelCheckpoint' else None + + callback_value = getattr(tf.keras.callbacks, key, default_value) + else: + callback_value = getattr(tf.keras.callbacks, name_callback, default_value) + args = None - callback_value = getattr(tf.keras.callbacks, key, default_value) if callback_value != default_value: - callbacks.append(callback_value(**args)) + callbacks.append(callback_value(**args) if args is not None else callback_value()) else: callbacks.append(self._load_custom_module(key, args)) @@ -299,11 +305,17 @@ class Trainer: for metric in self.config.metrics: if metric is 
None: continue - metric_value = getattr(tf.keras.metrics, f"{metric}", default_value) + if isinstance(metric,str): + args = None + metric_value = getattr(tf.keras.metrics, f"{metric}", default_value)() + else: + m, args = list(metric.items())[0] + metric_value = getattr(tf.keras.metrics, f"{m}", default_value)(**args) + if metric_value != default_value: - list_metrics.append(metric_value()) + list_metrics.append(metric_value) else: - list_metrics.append(self._load_custom_module(metric)) + list_metrics.append(self._load_custom_module(metric, args)) return list_metrics @staticmethod @@ -326,6 +338,7 @@ class Trainer: """Autolog the model using MLFlow""" if self.config.autolog: mlflow.tensorflow.autolog(log_models=True, disable=False) + self._log = True def _set_randomness(self, random_seed: Optional[int] = None) -> None: """Set the randomness""" -- GitLab From 86a2b7fb0ceed7d00198538db7568c9b79100bf4 Mon Sep 17 00:00:00 2001 From: andreafavia Date: Mon, 25 Jul 2022 13:32:58 +0200 Subject: [PATCH 15/31] Add asr example and fix bug callbacks loading --- examples/01-asr-ctc.ipynb | 359 ++++++++++++++++++++++++++++ examples/files/01_custom_modules.py | 67 ++++++ examples/files/01_trainer.yaml | 21 ++ examples/files/trainer.yaml | 2 + setup.py | 2 +- yaket/01_custom_modules.py | 67 ++++++ yaket/trainer.py | 23 +- 7 files changed, 533 insertions(+), 8 deletions(-) create mode 100644 examples/01-asr-ctc.ipynb create mode 100644 examples/files/01_custom_modules.py create mode 100644 examples/files/01_trainer.yaml create mode 100644 yaket/01_custom_modules.py diff --git a/examples/01-asr-ctc.ipynb b/examples/01-asr-ctc.ipynb new file mode 100644 index 0000000..3bab115 --- /dev/null +++ b/examples/01-asr-ctc.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import layers\n", + "from jiwer import wer\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_url = \"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\"\n", + "data_path = keras.utils.get_file(\"LJSpeech-1.1\", data_url, untar=True)\n", + "wavs_path = data_path + \"/wavs/\"\n", + "metadata_path = data_path + \"/metadata.csv\"\n", + "\n", + "\n", + "# Read metadata file and parse it\n", + "metadata_df = pd.read_csv(metadata_path, sep=\"|\", header=None, quoting=3)\n", + "metadata_df.columns = [\"file_name\", \"transcription\", \"normalized_transcription\"]\n", + "metadata_df = metadata_df[[\"file_name\", \"normalized_transcription\"]]\n", + "metadata_df = metadata_df.sample(frac=1).reset_index(drop=True)\n", + "metadata_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The set of characters accepted in the transcription.\n", + "characters = [x for x in \"abcdefghijklmnopqrstuvwxyz'?! 
\"]\n", + "# Mapping characters to integers\n", + "char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token=\"\")\n", + "# Mapping integers back to original characters\n", + "num_to_char = keras.layers.StringLookup(\n", + " vocabulary=char_to_num.get_vocabulary(), oov_token=\"\", invert=True\n", + ")\n", + "\n", + "print(\n", + " f\"The vocabulary is: {char_to_num.get_vocabulary()} \"\n", + " f\"(size ={char_to_num.vocabulary_size()})\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# An integer scalar Tensor. The window length in samples.\n", + "frame_length = 256\n", + "# An integer scalar Tensor. The number of samples to step.\n", + "frame_step = 160\n", + "# An integer scalar Tensor. The size of the FFT to apply.\n", + "# If not provided, uses the smallest power of 2 enclosing frame_length.\n", + "fft_length = 384\n", + "\n", + "\n", + "def encode_single_sample(wav_file, label):\n", + " ###########################################\n", + " ## Process the Audio\n", + " ##########################################\n", + " # 1. Read wav file\n", + " file = tf.io.read_file(wavs_path + wav_file + \".wav\")\n", + " # 2. Decode the wav file\n", + " audio, _ = tf.audio.decode_wav(file)\n", + " audio = tf.squeeze(audio, axis=-1)\n", + " # 3. Change type to float\n", + " audio = tf.cast(audio, tf.float32)\n", + " # 4. Get the spectrogram\n", + " spectrogram = tf.signal.stft(\n", + " audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length\n", + " )\n", + " # 5. We only need the magnitude, which can be derived by applying tf.abs\n", + " spectrogram = tf.abs(spectrogram)\n", + " spectrogram = tf.math.pow(spectrogram, 0.5)\n", + " # 6. normalisation\n", + " means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)\n", + " stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)\n", + " spectrogram = (spectrogram - means) / (stddevs + 1e-10)\n", + " ###########################################\n", + " ## Process the label\n", + " ##########################################\n", + " # 7. Convert label to Lower case\n", + " label = tf.strings.lower(label)\n", + " # 8. Split the label\n", + " label = tf.strings.unicode_split(label, input_encoding=\"UTF-8\")\n", + " # 9. Map the characters in label to numbers\n", + " label = char_to_num(label)\n", + " # 10. 
Return a dict as our model is expecting two inputs\n", + " return spectrogram, label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "split = int(len(metadata_df) * 0.90)\n", + "df_train = metadata_df[:split][:5]\n", + "df_val = metadata_df[split:][:2]\n", + "\n", + "print(f\"Size of the training set: {len(df_train)}\")\n", + "print(f\"Size of the training set: {len(df_val)}\")\n", + "\n", + "batch_size = 2\n", + "# Define the trainig dataset\n", + "train_dataset = tf.data.Dataset.from_tensor_slices(\n", + " (list(df_train[\"file_name\"]), list(df_train[\"normalized_transcription\"]))\n", + ")\n", + "train_dataset = (\n", + " train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)\n", + " .padded_batch(batch_size)\n", + " .prefetch(buffer_size=tf.data.AUTOTUNE)\n", + ")\n", + "\n", + "# Define the validation dataset\n", + "validation_dataset = tf.data.Dataset.from_tensor_slices(\n", + " (list(df_val[\"file_name\"]), list(df_val[\"normalized_transcription\"]))\n", + ")\n", + "validation_dataset = (\n", + " validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)\n", + " .padded_batch(batch_size)\n", + " .prefetch(buffer_size=tf.data.AUTOTUNE)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def CTCLoss(y_true, y_pred):\n", + " # Compute the training-time loss value\n", + " batch_len = tf.cast(tf.shape(y_true)[0], dtype=\"int64\")\n", + " input_length = tf.cast(tf.shape(y_pred)[1], dtype=\"int64\")\n", + " label_length = tf.cast(tf.shape(y_true)[1], dtype=\"int64\")\n", + "\n", + " input_length = input_length * tf.ones(shape=(batch_len, 1), dtype=\"int64\")\n", + " label_length = label_length * tf.ones(shape=(batch_len, 1), dtype=\"int64\")\n", + "\n", + " loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)\n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):\n", + " \"\"\"Model similar to DeepSpeech2.\"\"\"\n", + " # Model's input\n", + " input_spectrogram = layers.Input((None, input_dim), name=\"input\")\n", + " # Expand the dimension to use 2D CNN.\n", + " x = layers.Reshape((-1, input_dim, 1), name=\"expand_dim\")(input_spectrogram)\n", + " # Convolution layer 1\n", + " x = layers.Conv2D(\n", + " filters=32,\n", + " kernel_size=[11, 41],\n", + " strides=[2, 2],\n", + " padding=\"same\",\n", + " use_bias=False,\n", + " name=\"conv_1\",\n", + " )(x)\n", + " x = layers.BatchNormalization(name=\"conv_1_bn\")(x)\n", + " x = layers.ReLU(name=\"conv_1_relu\")(x)\n", + " # Convolution layer 2\n", + " x = layers.Conv2D(\n", + " filters=32,\n", + " kernel_size=[11, 21],\n", + " strides=[1, 2],\n", + " padding=\"same\",\n", + " use_bias=False,\n", + " name=\"conv_2\",\n", + " )(x)\n", + " x = layers.BatchNormalization(name=\"conv_2_bn\")(x)\n", + " x = layers.ReLU(name=\"conv_2_relu\")(x)\n", + " # Reshape the resulted volume to feed the RNNs layers\n", + " x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)\n", + " # RNN layers\n", + " for i in range(1, rnn_layers + 1):\n", + " recurrent = layers.GRU(\n", + " units=rnn_units,\n", + " activation=\"tanh\",\n", + " recurrent_activation=\"sigmoid\",\n", + " use_bias=True,\n", + " return_sequences=True,\n", + " reset_after=True,\n", + " name=f\"gru_{i}\",\n", + " )\n", + " x = 
layers.Bidirectional(\n", + " recurrent, name=f\"bidirectional_{i}\", merge_mode=\"concat\"\n", + " )(x)\n", + " if i < rnn_layers:\n", + " x = layers.Dropout(rate=0.5)(x)\n", + " # Dense layer\n", + " x = layers.Dense(units=rnn_units * 2, name=\"dense_1\")(x)\n", + " x = layers.ReLU(name=\"dense_1_relu\")(x)\n", + " x = layers.Dropout(rate=0.5)(x)\n", + " # Classification layer\n", + " output = layers.Dense(units=output_dim + 1, activation=\"softmax\")(x)\n", + " # Model\n", + " model = keras.Model(input_spectrogram, output, name=\"DeepSpeech_2\")\n", + " # Optimizer\n", + " opt = keras.optimizers.Adam(learning_rate=1e-4)\n", + " # Compile the model and return\n", + " model.compile(optimizer=opt, loss=CTCLoss)\n", + " return model\n", + "\n", + "\n", + "# Get the model\n", + "model = build_model(\n", + " input_dim=fft_length // 2 + 1,\n", + " output_dim=char_to_num.vocabulary_size(),\n", + " rnn_units=512,\n", + ")\n", + "#model.summary(line_length=110)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def decode_batch_predictions(pred):\n", + " input_len = np.ones(pred.shape[0]) * pred.shape[1]\n", + " # Use greedy search. For complex tasks, you can use beam search\n", + " results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]\n", + " # Iterate over the results and get back the text\n", + " output_text = []\n", + " for result in results:\n", + " result = tf.strings.reduce_join(num_to_char(result)).numpy().decode(\"utf-8\")\n", + " output_text.append(result)\n", + " return output_text\n", + "\n", + "\n", + "# A callback class to output a few transcriptions during training\n", + "class CallbackEval(keras.callbacks.Callback):\n", + " \"\"\"Displays a batch of outputs after every epoch.\"\"\"\n", + "\n", + " def __init__(self, dataset):\n", + " super().__init__()\n", + " self.dataset = dataset\n", + "\n", + " def on_epoch_end(self, epoch: int, logs=None):\n", + " predictions = []\n", + " targets = []\n", + " for batch in self.dataset:\n", + " X, y = batch\n", + " batch_predictions = model.predict(X)\n", + " batch_predictions = decode_batch_predictions(batch_predictions)\n", + " predictions.extend(batch_predictions)\n", + " for label in y:\n", + " label = (\n", + " tf.strings.reduce_join(num_to_char(label)).numpy().decode(\"utf-8\")\n", + " )\n", + " targets.append(label)\n", + " wer_score = wer(targets, predictions)\n", + " print(\"-\" * 100)\n", + " print(f\"Word Error Rate: {wer_score:.4f}\")\n", + " print(\"-\" * 100)\n", + " for i in np.random.randint(0, len(predictions), 2):\n", + " print(f\"Target : {targets[i]}\")\n", + " print(f\"Prediction: {predictions[i]}\")\n", + " print(\"-\" * 100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's check results on more validation samples\n", + "predictions = []\n", + "targets = []\n", + "for batch in validation_dataset:\n", + " X, y = batch\n", + " batch_predictions = model.predict(X)\n", + " batch_predictions = decode_batch_predictions(batch_predictions)\n", + " predictions.extend(batch_predictions)\n", + " for label in y:\n", + " label = tf.strings.reduce_join(num_to_char(label)).numpy().decode(\"utf-8\")\n", + " targets.append(label)\n", + "wer_score = wer(targets, predictions)\n", + "print(\"-\" * 100)\n", + "print(f\"Word Error Rate: {wer_score:.4f}\")\n", + "print(\"-\" * 100)\n", + "for i in np.random.randint(0, len(predictions), 5):\n", + " print(f\"Target : 
{targets[i]}\")\n", + " print(f\"Prediction: {predictions[i]}\")\n", + " print(\"-\" * 100)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from yaket.trainer import Trainer\n", + "\n", + "model.summary()\n", + "path = '/root/project/yaket/examples/files/01_trainer.yaml'\n", + "custom_path = '/root/project/yaket/examples/files/01_custom_modules.py'\n", + "trainer = Trainer(path, model = model, train_dataset = train_dataset, val_dataset = validation_dataset,\n", + " custom_modules_path=custom_path)\n", + "trainer.train(1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.4 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/files/01_custom_modules.py b/examples/files/01_custom_modules.py new file mode 100644 index 0000000..bcfb9cc --- /dev/null +++ b/examples/files/01_custom_modules.py @@ -0,0 +1,67 @@ +import numpy as np +import tensorflow as tf +from tensorflow import keras +from jiwer import wer + +# The set of characters accepted in the transcription. +characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "] +# Mapping characters to integers +char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="") +# Mapping integers back to original characters +num_to_char = keras.layers.StringLookup( + vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True +) + +def decode_batch_predictions(pred): + input_len = np.ones(pred.shape[0]) * pred.shape[1] + # Use greedy search. 
For complex tasks, you can use beam search + results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0] + # Iterate over the results and get back the text + output_text = [] + for result in results: + result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8") + output_text.append(result) + return output_text + + +# A callback class to output a few transcriptions during training +class CallbackEval(keras.callbacks.Callback): + """Displays a batch of outputs after every epoch.""" + + def __init__(self, dataset): + super().__init__() + self.dataset = dataset + + def on_epoch_end(self, epoch: int, logs=None): + predictions = [] + targets = [] + for batch in self.dataset: + X, y = batch + batch_predictions = self.model.predict(X) + batch_predictions = decode_batch_predictions(batch_predictions) + predictions.extend(batch_predictions) + for label in y: + label = ( + tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8") + ) + targets.append(label) + wer_score = wer(targets, predictions) + print("-" * 100) + print(f"Word Error Rate: {wer_score:.4f}") + print("-" * 100) + for i in np.random.randint(0, len(predictions), 2): + print(f"Target : {targets[i]}") + print(f"Prediction: {predictions[i]}") + print("-" * 100) + +def CTCLoss(y_true, y_pred): + # Compute the training-time loss value + batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64") + input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") + label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") + + input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64") + label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64") + + loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length) + return loss diff --git a/examples/files/01_trainer.yaml b/examples/files/01_trainer.yaml new file mode 100644 index 0000000..9bd3754 --- /dev/null +++ b/examples/files/01_trainer.yaml @@ -0,0 +1,21 @@ +autolog: False +# optimizer: Adam # SGOptimizer +optimizer_params: + learning_rate: 0.01 +batch_size: 128 +loss: CTCLoss +callbacks: + - EarlyStopping + - ReduceLROnPlateau: + monitor: val_loss + mode: min + patience: 5 + - CallbackEval: + dataset: valdidation # custom callbacks that require dataset you need this key:value pair + +verbose: 1 # 0, 1, 2 +epochs: 1 +# shuffle: False +class_weights: # First value is boolean + - False +accelerator: cpu #Make it optional diff --git a/examples/files/trainer.yaml b/examples/files/trainer.yaml index 3a2dabf..6c831e6 100644 --- a/examples/files/trainer.yaml +++ b/examples/files/trainer.yaml @@ -15,6 +15,8 @@ callbacks: monitor: val_loss mode: min patience: 5 + - CallbackEval: + dataset: valdidation # # verbose: 1 # - ModelCheckpoint: # monitor: val_accuracy diff --git a/setup.py b/setup.py index 5601580..351ab08 100644 --- a/setup.py +++ b/setup.py @@ -9,5 +9,5 @@ setup(name = 'yaket', packages = find_packages(include = ['yaket', 'yaket.*']), setup_requires = ['flake8'], install_requires=['pydantic','pyyaml','mlflow'], - extras_require = {"tensorflow": ["tensorflow>=2.4"]}, + extras_require = {"tensorflow": ["tensorflow>=2.4"], "jiwer": ["jiwer"]}, ) diff --git a/yaket/01_custom_modules.py b/yaket/01_custom_modules.py new file mode 100644 index 0000000..bcfb9cc --- /dev/null +++ b/yaket/01_custom_modules.py @@ -0,0 +1,67 @@ +import numpy as np +import tensorflow as tf +from tensorflow import keras +from jiwer import wer + +# The set of characters accepted in the transcription. 
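+# Descriptive note: StringLookup with oov_token="" reserves index 0 for
+# out-of-vocabulary characters, so char_to_num maps "a" -> 1, "b" -> 2, ...,
+# and num_to_char inverts the same vocabulary, giving a reversible
+# char <-> id mapping.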
+characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "] +# Mapping characters to integers +char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="") +# Mapping integers back to original characters +num_to_char = keras.layers.StringLookup( + vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True +) + +def decode_batch_predictions(pred): + input_len = np.ones(pred.shape[0]) * pred.shape[1] + # Use greedy search. For complex tasks, you can use beam search + results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0] + # Iterate over the results and get back the text + output_text = [] + for result in results: + result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8") + output_text.append(result) + return output_text + + +# A callback class to output a few transcriptions during training +class CallbackEval(keras.callbacks.Callback): + """Displays a batch of outputs after every epoch.""" + + def __init__(self, dataset): + super().__init__() + self.dataset = dataset + + def on_epoch_end(self, epoch: int, logs=None): + predictions = [] + targets = [] + for batch in self.dataset: + X, y = batch + batch_predictions = self.model.predict(X) + batch_predictions = decode_batch_predictions(batch_predictions) + predictions.extend(batch_predictions) + for label in y: + label = ( + tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8") + ) + targets.append(label) + wer_score = wer(targets, predictions) + print("-" * 100) + print(f"Word Error Rate: {wer_score:.4f}") + print("-" * 100) + for i in np.random.randint(0, len(predictions), 2): + print(f"Target : {targets[i]}") + print(f"Prediction: {predictions[i]}") + print("-" * 100) + +def CTCLoss(y_true, y_pred): + # Compute the training-time loss value + batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64") + input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") + label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") + + input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64") + label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64") + + loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length) + return loss diff --git a/yaket/trainer.py b/yaket/trainer.py index 86455a3..6f89951 100644 --- a/yaket/trainer.py +++ b/yaket/trainer.py @@ -11,7 +11,7 @@ import mlflow import os import time import subprocess as sp - +import sys @dataclass @@ -58,12 +58,12 @@ class Trainer: if not isinstance(self.custom_modules_path, str) or not os.path.isfile(self.custom_modules_path): raise Exception("Costum modules path must be a valid path string") - + if self.custom_modules_path: + self._import_custom_model(self.custom_modules_path) self._config = self._parse_config() self._accelerator = Accelerator[self.config.accelerator] self._validate_config_file() - if self.custom_modules_path: - self._import_custom_model(self.custom_modules_path) + self._callbacks = self._get_callbacks() self._input_shape = self._get_input_shape() @@ -95,7 +95,7 @@ class Trainer: history = self.model.fit( x=train_dataset, y=None, - epochs=int(self.config.epochs), + epochs=int(self.config.epochs) if epochs is None else epochs, validation_data=val_dataset, batch_size=None, callbacks=self._callbacks, @@ -229,7 +229,10 @@ class Trainer: def _import_custom_model(self, module_name: str): try: - self._custom_module = importlib.import_module(module_name) + custom_dirpath = os.path.dirname(module_name) + sys.path.append(custom_dirpath) + module = 
module_name.split('/')[-1].split('.')[0] + self._custom_module = importlib.import_module(module) except Exception as e: raise ImportError(f"Error importing {module_name}: {e}") @@ -291,7 +294,13 @@ class Trainer: if callback_value != default_value: callbacks.append(callback_value(**args) if args is not None else callback_value()) else: - callbacks.append(self._load_custom_module(key, args)) + if args is not None: + if args.get('dataset') is not None: + dataset = self.val_dataset if args['dataset'] in ['val','dev','validation'] else self.train_dataset + args['dataset'] = dataset + callbacks.append(self._load_custom_module(key, args)) + else: + callbacks.append(self._load_custom_module(name_callback, args)) return callbacks -- GitLab From 5728e4dea15006c3d09eadc164fc886a6666d0d8 Mon Sep 17 00:00:00 2001 From: andreafavia Date: Mon, 25 Jul 2022 14:20:00 +0200 Subject: [PATCH 16/31] Refactor code --- examples/files/01_trainer.yaml | 3 +-- yaket/trainer.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/files/01_trainer.yaml b/examples/files/01_trainer.yaml index 9bd3754..07b97da 100644 --- a/examples/files/01_trainer.yaml +++ b/examples/files/01_trainer.yaml @@ -11,8 +11,7 @@ callbacks: mode: min patience: 5 - CallbackEval: - dataset: valdidation # custom callbacks that require dataset you need this key:value pair - + dataset: valdidation # custom callbacks that require dataset you need this key:value pair verbose: 1 # 0, 1, 2 epochs: 1 # shuffle: False diff --git a/yaket/trainer.py b/yaket/trainer.py index 6f89951..248afd5 100644 --- a/yaket/trainer.py +++ b/yaket/trainer.py @@ -25,7 +25,6 @@ class Trainer: validate_yaml: bool = True custom_modules_path: Optional[str] = None - # internals _config: TrainingModel = None @@ -136,6 +135,7 @@ class Trainer: os.makedirs(os.getcwd()+'/models', exist_ok=True) t = int(time.time()) self.model.save(os.getcwd()+f"/models/{t}_best_model") + def _clone_model(self): """Clone the model so that it works within tf.distribute.Strategy It works only for models not using custom objects -- GitLab From 3f469faff763f367f26a371f1d057df0543bc0aa Mon Sep 17 00:00:00 2001 From: andreafavia Date: Mon, 25 Jul 2022 17:15:18 +0200 Subject: [PATCH 17/31] Add Converter class --- .gitignore | 4 + examples/00-simple-mnist-convnet.ipynb | 75 ++++++++- examples/files/trainer.yaml | 2 - setup.py | 5 +- yaket/01_custom_modules.py | 67 --------- yaket/converter/__init__.py | 0 yaket/converter/converter.py | 83 ++++++++++ yaket/trainer.py | 201 ++++++++++++++++--------- 8 files changed, 291 insertions(+), 146 deletions(-) delete mode 100644 yaket/01_custom_modules.py create mode 100644 yaket/converter/__init__.py create mode 100644 yaket/converter/converter.py diff --git a/.gitignore b/.gitignore index f7d6237..d2124ee 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,7 @@ mlruns/ yaket/__pycache__/ .vscode/ models/ +yaket/converter/__pycache__/ +*.onnx +*.pb +*.tflite diff --git a/examples/00-simple-mnist-convnet.ipynb b/examples/00-simple-mnist-convnet.ipynb index 08d0ff6..207c5da 100644 --- a/examples/00-simple-mnist-convnet.ipynb +++ b/examples/00-simple-mnist-convnet.ipynb @@ -2,9 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-25 16:43:44.205743: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2022-07-25 16:43:44.212751: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", + "2022-07-25 16:43:44.212796: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" + ] + } + ], "source": [ "import numpy as np\n", "from tensorflow import keras\n", @@ -14,9 +24,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x_train shape: (60000, 28, 28, 1)\n", + "60000 train samples\n", + "10000 test samples\n" + ] + } + ], "source": [ "# Model / data parameters\n", "num_classes = 10\n", @@ -43,9 +63,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " conv2d (Conv2D) (None, 26, 26, 32) 320 \n", + " \n", + " max_pooling2d (MaxPooling2D (None, 13, 13, 32) 0 \n", + " ) \n", + " \n", + " conv2d_1 (Conv2D) (None, 11, 11, 64) 18496 \n", + " \n", + " max_pooling2d_1 (MaxPooling (None, 5, 5, 64) 0 \n", + " 2D) \n", + " \n", + " flatten (Flatten) (None, 1600) 0 \n", + " \n", + " dropout (Dropout) (None, 1600) 0 \n", + " \n", + " dense (Dense) (None, 10) 16010 \n", + " \n", + "=================================================================\n", + "Total params: 34,826\n", + "Trainable params: 34,826\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-25 16:43:47.963880: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n", + "2022-07-25 16:43:47.964009: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)\n", + "2022-07-25 16:43:47.964048: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dc8739c0aa9a): /proc/driver/nvidia/version does not exist\n", + "2022-07-25 16:43:47.964632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], "source": [ "model = keras.Sequential(\n", " [\n", diff --git a/examples/files/trainer.yaml b/examples/files/trainer.yaml index 6c831e6..3a2dabf 100644 --- a/examples/files/trainer.yaml +++ b/examples/files/trainer.yaml @@ -15,8 +15,6 @@ callbacks: monitor: val_loss mode: min patience: 5 - - CallbackEval: - dataset: valdidation # # verbose: 1 # - ModelCheckpoint: # monitor: 
val_accuracy diff --git a/setup.py b/setup.py index 351ab08..388beef 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ setup(name = 'yaket', url = '', packages = find_packages(include = ['yaket', 'yaket.*']), setup_requires = ['flake8'], - install_requires=['pydantic','pyyaml','mlflow'], - extras_require = {"tensorflow": ["tensorflow>=2.4"], "jiwer": ["jiwer"]}, + install_requires=['pydantic','pyyaml','mlflow','tf2onnx'], + extras_require = {"tensorflow": ["tensorflow>=2.4"], "jiwer": ["jiwer"], + "onnx_runtime": ["onnxruntime"]}, ) diff --git a/yaket/01_custom_modules.py b/yaket/01_custom_modules.py deleted file mode 100644 index bcfb9cc..0000000 --- a/yaket/01_custom_modules.py +++ /dev/null @@ -1,67 +0,0 @@ -import numpy as np -import tensorflow as tf -from tensorflow import keras -from jiwer import wer - -# The set of characters accepted in the transcription. -characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "] -# Mapping characters to integers -char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="") -# Mapping integers back to original characters -num_to_char = keras.layers.StringLookup( - vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True -) - -def decode_batch_predictions(pred): - input_len = np.ones(pred.shape[0]) * pred.shape[1] - # Use greedy search. For complex tasks, you can use beam search - results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0] - # Iterate over the results and get back the text - output_text = [] - for result in results: - result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8") - output_text.append(result) - return output_text - - -# A callback class to output a few transcriptions during training -class CallbackEval(keras.callbacks.Callback): - """Displays a batch of outputs after every epoch.""" - - def __init__(self, dataset): - super().__init__() - self.dataset = dataset - - def on_epoch_end(self, epoch: int, logs=None): - predictions = [] - targets = [] - for batch in self.dataset: - X, y = batch - batch_predictions = self.model.predict(X) - batch_predictions = decode_batch_predictions(batch_predictions) - predictions.extend(batch_predictions) - for label in y: - label = ( - tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8") - ) - targets.append(label) - wer_score = wer(targets, predictions) - print("-" * 100) - print(f"Word Error Rate: {wer_score:.4f}") - print("-" * 100) - for i in np.random.randint(0, len(predictions), 2): - print(f"Target : {targets[i]}") - print(f"Prediction: {predictions[i]}") - print("-" * 100) - -def CTCLoss(y_true, y_pred): - # Compute the training-time loss value - batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64") - input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") - label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") - - input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64") - label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64") - - loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length) - return loss diff --git a/yaket/converter/__init__.py b/yaket/converter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/yaket/converter/converter.py b/yaket/converter/converter.py new file mode 100644 index 0000000..d991c04 --- /dev/null +++ b/yaket/converter/converter.py @@ -0,0 +1,83 @@ +import tf2onnx +import tensorflow as tf +from typing import List, Optional, Tuple, Union, Any, Dict, Callable +from dataclasses import 
dataclass
+from pathlib import Path
+import subprocess
+import sys
+
+
+@dataclass
+class Converter:
+    out_format: Union[str, Path]
+    model: Optional[tf.keras.Model] = None
+    model_path: Optional[Union[str, Path]] = None
+    out_path: Union[str, Path] = "model.onnx"  # model.tflite
+    opset_onnx: int = 15
+
+    def _init_converter(self) -> None:
+        if self.out_format not in ["onnx", "tflite"]:
+            raise ValueError(f"Unknown output format: {self.out_format}")
+        if self.model is not None and not isinstance(self.model, tf.keras.Model):
+            raise ValueError(f"Model must be a tf.keras.Model, got {type(self.model)}")
+        if not isinstance(self.out_path, (str, Path)):
+            raise ValueError(
+                f"Output path must be a string or Path type, got {type(self.out_path)}"
+            )
+
+    def convert(self) -> bool:
+        if self.out_format == "onnx":
+            self._convert_to_onnx()
+            return True
+        elif self.out_format == "tflite":
+            self._convert_to_tflite()
+            return True
+        else:
+            raise ValueError(f"Unknown output format: {self.out_format}")
+
+    def _convert_to_onnx(self):
+        "Convert the Keras model to ONNX, via the tf2onnx CLI when only a saved model path is given"
+        # build and run the tf2onnx command-line call
+
+        if not self.out_path.endswith(".onnx"):
+            self.out_path = self.out_path + ".onnx"
+        try:
+            if self.model is None:
+                opset_string = (
+                    f"--opset {self.opset_onnx}" if self.opset_onnx is not None else ""
+                )
+                python_version = 3 if sys.version_info.major == 3 else ""
+                command = f"python{python_version} -m tf2onnx.convert {opset_string} --saved-model {self.model_path} --output {self.out_path}"
+                print(command)
+                subprocess.run(command.split(), shell=False)
+            else:
+                specs = [
+                    tf.TensorSpec(
+                        input_model.shape, input_model.dtype, name=f"input_{i}"
+                    )
+                    for i, input_model in enumerate(self.model.inputs)
+                ]
+                model_proto, _ = tf2onnx.convert.from_keras(
+                    self.model, input_signature=specs, output_path=self.out_path
+                )
+                output_names = [n.name for n in model_proto.graph.output]
+                print("Specs:", specs)
+                print("Output names:", output_names)
+        except Exception as e:
+            raise e
+
+    def _convert_to_tflite(self):
+
+        if self.model is None:
+            tf_converter = tf.lite.TFLiteConverter.from_saved_model(self.model_path)  # path to the SavedModel directory
+            tflite_model = tf_converter.convert()
+        else:
+            tf_converter = tf.lite.TFLiteConverter.from_keras_model(self.model)
+            tflite_model = tf_converter.convert()
+
+        if not self.out_path.endswith(".tflite"):
+            self.out_path = self.out_path + ".tflite"
+        # save model
+        with open(self.out_path, 'wb') as f:
+            f.write(tflite_model)
diff --git a/yaket/trainer.py b/yaket/trainer.py
index 248afd5..4d713de 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -6,6 +6,7 @@ import numpy as np
 import tensorflow as tf
 import gc
 from yaket.schema.schema import TrainingModel, yaml_to_pydantic, Accelerator
+from yaket.converter.converter import Converter
 import importlib
 import mlflow
 import os
 import time
 import subprocess as sp
 import sys
 
 
 @dataclass
@@ -25,7 +26,6 @@ class Trainer:
     validate_yaml: bool = True
     custom_modules_path: Optional[str] = None
 
-
     # internals
     _config: TrainingModel = None
     _input_shape: Tuple[int, ...]
= None @@ -37,24 +37,30 @@ class Trainer: _history: Dict[str, Any] = None _model_checkpoint: Optional[str] = None _accelerator: Optional[Accelerator] = None - _log:bool = False - + _log: bool = False + _out_path: str = None def _init_trainer(self) -> None: """Initialize the trainer""" if not isinstance(self.model, tf.keras.models.Model): - raise Exception('model must be keras model') - if not isinstance(self.config_path,str) or not os.path.isfile(self.config_path): - raise Exception('Config path must be a valid file path') - if self.strategy is not None and not isinstance(self.strategy, tf.distribute.Strategy): + raise Exception("model must be keras model") + if not isinstance(self.config_path, str) or not os.path.isfile( + self.config_path + ): + raise Exception("Config path must be a valid file path") + if self.strategy is not None and not isinstance( + self.strategy, tf.distribute.Strategy + ): raise Exception("Strategy must be keras strategy object") if not isinstance(self.random_seed, int): raise Exception("Random seed must be an integer") if not isinstance(self.validate_yaml, bool): raise Exception("Validate yaml must be a boolean value") if self.custom_modules_path is not None: - if not isinstance(self.custom_modules_path, str) or not os.path.isfile(self.custom_modules_path): + if not isinstance(self.custom_modules_path, str) or not os.path.isfile( + self.custom_modules_path + ): raise Exception("Costum modules path must be a valid path string") if self.custom_modules_path: @@ -66,29 +72,28 @@ class Trainer: self._callbacks = self._get_callbacks() self._input_shape = self._get_input_shape() - - def train(self, epochs: int = None): """Train the model. Main function to call. - + Args ---- epochs: int, optional Number of epochs to train. If None, will train with the number of epochs specified in the config file. - + """ self._init_trainer() self._autolog() strategy = self._get_strategy() - batch_size = strategy.num_replicas_in_sync*self.config.batch_size - train_dataset = self._get_x_y_train(batch_size) + batch_size = strategy.num_replicas_in_sync * self.config.batch_size + train_dataset = self._get_x_y_train(batch_size) val_dataset = self._get_x_y_val(batch_size) - + with strategy.scope(): - if self.strategy is None: self._clone_model() + if self.strategy is None: + self._clone_model() self._compile_model() history = self.model.fit( @@ -98,16 +103,53 @@ class Trainer: validation_data=val_dataset, batch_size=None, callbacks=self._callbacks, - class_weight=None, #TODO: add class_weight, + class_weight=None, # TODO: add class_weight, verbose=int(self.config.verbose), ) - self._save_model() self._history = history.history self._clean_workspace() return history + def convert_model( + self, + format_model: str = "onnx", + opset_onnx: int = 15, + output_path: str = "model", + from_command_line: bool = True, + ): + + """Convert the model to a different format. Available formats: + 1. ONNX + 2. TensorFlow Lite + + Args + ---- + format: str + Format to convert the model to. Available formats: onnx, tflite + model_path: str + Path to the model to convert. + output_path: str + Path to the output file. + from_command_line: bool + Whether or not to convert the model using the command line. + It might be not available with some platforms. 
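+        Example
+        -------
+        An illustrative call, assuming `train()` has already run and saved
+        a model (so a SavedModel path exists for the command-line route):
+
+            trainer.convert_model(format_model="onnx", output_path="model")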
+ + """ + if from_command_line and self._out_path is None: + raise Exception("You need to have a saved model before converting using command line") + model_to_convert = self.model if not from_command_line else None + converter = Converter( + out_format=format_model.lower(), + opset_onnx=opset_onnx, + model_path=self._out_path, + model=model_to_convert, + out_path = output_path, + ) + if converter.convert(): + print(f"Successfully converted to format {format} ") + def _get_input_shape(self): """Get the input shape of input dataset""" if isinstance(self.train_dataset, tf.data.Dataset): @@ -122,46 +164,55 @@ class Trainer: if self._model_checkpoint is not None: self.model.load_weights(self._model_checkpoint) if self._log: - self.model.save('/tmp/best_model') + self.model.save("/tmp/best_model") run = mlflow.last_active_run() - idx = 7 #TODO: check is always the same + idx = 7 # TODO: check is always the same artifact_path = run.info.artifact_uri[idx:] - self.model.save(artifact_path+f"/best_model") + self._out_path = artifact_path + f"/best_model" + self.model.save(self._out_path) else: - os.makedirs(os.getcwd()+'/models', exist_ok=True) + os.makedirs(os.getcwd() + "/models", exist_ok=True) t = int(time.time()) - self.model.save(os.getcwd()+f"/models/{t}_best_model") + self._out_path = os.getcwd() + f"/models/{t}_best_model" + self.model.save(self._out_path) else: - os.makedirs(os.getcwd()+'/models', exist_ok=True) + os.makedirs(os.getcwd() + "/models", exist_ok=True) t = int(time.time()) - self.model.save(os.getcwd()+f"/models/{t}_best_model") - + self._out_path = os.getcwd() + f"/models/{t}_best_model" + self.model.save(self._out_path) + def _clone_model(self): """Clone the model so that it works within tf.distribute.Strategy - It works only for models not using custom objects + It works only for models not using custom objects """ self.model = tf.keras.models.clone_model(self.model) - def _get_x_y_val(self, batch_size): """Get the x and y for training based on the format of the dataset""" options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.OFF + ) if self.val_dataset is None: return None if isinstance(self.val_dataset, tf.data.Dataset): return self.val_dataset else: - val = tf.data.Dataset.from_tensor_slices(self.val_dataset)\ - .batch(batch_size).with_options(options) + val = ( + tf.data.Dataset.from_tensor_slices(self.val_dataset) + .batch(batch_size) + .with_options(options) + ) return val def _get_x_y_train(self, batch_size): """Get the x and y for training based on the format of the dataset""" options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - + options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.OFF + ) + if isinstance(self.train_dataset, tf.data.Dataset): x = self.train_dataset else: @@ -181,27 +232,30 @@ class Trainer: self._optimizer = self._get_optimizer() self._loss = self._get_loss() self._metrics = self._get_metrics() - + self.model.compile( optimizer=self._optimizer, loss=self._loss, metrics=self._get_metrics() ) def _get_strategy(self): if self.strategy is None: - if self._accelerator is None : + if self._accelerator is None: return tf.distribute.MirroredStrategy() if self._accelerator is Accelerator.gpu: index = Trainer.get_free_gpu_idx() return tf.distribute.OneDeviceStrategy(f"/gpu:{index}") - if 
self._accelerator is Accelerator.cpu or self._accelerator is Accelerator.mgpu: + if ( + self._accelerator is Accelerator.cpu + or self._accelerator is Accelerator.mgpu + ): # If GPUs are not available, it will use CPUs return tf.distribute.MirroredStrategy() if self._accelerator is Accelerator.tpu: - #TODO: check configuration for tpu strategy + # TODO: check configuration for tpu strategy return tf.distribute.TPUStrategy() else: return self.strategy - + @staticmethod def get_free_gpu_idx(): """Get the index of the freer GPU""" @@ -212,7 +266,6 @@ class Trainer: memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)] return int(np.argmin(memory_free_values)) - def _parse_config(self) -> Any: return yaml_to_pydantic(self.config_path, self.validate_yaml) @@ -231,7 +284,7 @@ class Trainer: try: custom_dirpath = os.path.dirname(module_name) sys.path.append(custom_dirpath) - module = module_name.split('/')[-1].split('.')[0] + module = module_name.split("/")[-1].split(".")[0] self._custom_module = importlib.import_module(module) except Exception as e: raise ImportError(f"Error importing {module_name}: {e}") @@ -284,20 +337,30 @@ class Trainer: args = list(name_callback.values())[0] # Track filepath if it's a ModelCheckpoint - self._model_checkpoint =args['filepath'] if key == 'ModelCheckpoint' else None + self._model_checkpoint = ( + args["filepath"] if key == "ModelCheckpoint" else None + ) callback_value = getattr(tf.keras.callbacks, key, default_value) else: - callback_value = getattr(tf.keras.callbacks, name_callback, default_value) + callback_value = getattr( + tf.keras.callbacks, name_callback, default_value + ) args = None if callback_value != default_value: - callbacks.append(callback_value(**args) if args is not None else callback_value()) + callbacks.append( + callback_value(**args) if args is not None else callback_value() + ) else: if args is not None: - if args.get('dataset') is not None: - dataset = self.val_dataset if args['dataset'] in ['val','dev','validation'] else self.train_dataset - args['dataset'] = dataset + if args.get("dataset") is not None: + dataset = ( + self.val_dataset + if args["dataset"] in ["val", "dev", "validation"] + else self.train_dataset + ) + args["dataset"] = dataset callbacks.append(self._load_custom_module(key, args)) else: callbacks.append(self._load_custom_module(name_callback, args)) @@ -314,13 +377,13 @@ class Trainer: for metric in self.config.metrics: if metric is None: continue - if isinstance(metric,str): + if isinstance(metric, str): args = None metric_value = getattr(tf.keras.metrics, f"{metric}", default_value)() else: m, args = list(metric.items())[0] metric_value = getattr(tf.keras.metrics, f"{m}", default_value)(**args) - + if metric_value != default_value: list_metrics.append(metric_value) else: @@ -358,8 +421,7 @@ class Trainer: else: tf.random.set_seed(random_seed) np.random.seed(random_seed) - - + def clear_ram(self): "Delete model and all datasets saved in the Trainer class" del self.model @@ -373,8 +435,7 @@ class Trainer: self.model.summary() - -if __name__ == '__main__': +if __name__ == "__main__": import numpy as np from tensorflow import keras from tensorflow.keras import layers @@ -396,7 +457,6 @@ if __name__ == '__main__': print(x_train.shape[0], "train samples") print(x_test.shape[0], "test samples") - # convert class vectors to binary class matrices y_train = keras.utils.to_categorical(y_train, num_classes) y_test = keras.utils.to_categorical(y_test, num_classes) @@ -404,24 +464,27 @@ if __name__ == 
'__main__': strategy = tf.distribute.MirroredStrategy() with strategy.scope(): model = keras.Sequential( - [ - keras.Input(shape=input_shape), - layers.Conv2D(32, kernel_size=(3, 3), activation="relu"), - layers.MaxPooling2D(pool_size=(2, 2)), - layers.Conv2D(64, kernel_size=(3, 3), activation="relu"), - layers.MaxPooling2D(pool_size=(2, 2)), - layers.Flatten(), - layers.Dropout(0.5), - layers.Dense(num_classes, activation="softmax"), - ] + [ + keras.Input(shape=input_shape), + layers.Conv2D(32, kernel_size=(3, 3), activation="relu"), + layers.MaxPooling2D(pool_size=(2, 2)), + layers.Conv2D(64, kernel_size=(3, 3), activation="relu"), + layers.MaxPooling2D(pool_size=(2, 2)), + layers.Flatten(), + layers.Dropout(0.5), + layers.Dense(num_classes, activation="softmax"), + ] ) model.summary() - + path = "/root/project/yaket/examples/files/trainer.yaml" - path = '/root/project/yaket/examples/files/trainer.yaml' - - trainer = Trainer(config_path = path, train_dataset=(x_train, y_train), val_dataset=(x_test, y_test), \ - model=model, strategy=strategy) - trainer.train() \ No newline at end of file + trainer = Trainer( + config_path=path, + train_dataset=(x_train, y_train), + val_dataset=(x_test, y_test), + model=model, + strategy=strategy, + ) + trainer.train() -- GitLab From 10a3ea73dbbee8338c3cc5dc474c34f3df012eea Mon Sep 17 00:00:00 2001 From: andreafavia Date: Mon, 25 Jul 2022 17:15:50 +0200 Subject: [PATCH 18/31] Remove output notebook --- examples/00-simple-mnist-convnet.ipynb | 99 ++++++++------------------ 1 file changed, 29 insertions(+), 70 deletions(-) diff --git a/examples/00-simple-mnist-convnet.ipynb b/examples/00-simple-mnist-convnet.ipynb index 207c5da..bf62606 100644 --- a/examples/00-simple-mnist-convnet.ipynb +++ b/examples/00-simple-mnist-convnet.ipynb @@ -2,19 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-25 16:43:44.205743: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2022-07-25 16:43:44.212751: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", - "2022-07-25 16:43:44.212796: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "from tensorflow import keras\n", @@ -24,19 +14,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "x_train shape: (60000, 28, 28, 1)\n", - "60000 train samples\n", - "10000 test samples\n" - ] - } - ], + "outputs": [], "source": [ "# Model / data parameters\n", "num_classes = 10\n", @@ -63,52 +43,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sequential\"\n", - "_________________________________________________________________\n", - " Layer (type) Output Shape Param # \n", - "=================================================================\n", - " conv2d (Conv2D) (None, 26, 26, 32) 320 \n", - " \n", - " max_pooling2d (MaxPooling2D (None, 13, 13, 32) 0 \n", - " ) \n", - " \n", - " conv2d_1 (Conv2D) (None, 11, 11, 64) 18496 \n", - " \n", - " max_pooling2d_1 (MaxPooling (None, 5, 5, 64) 0 \n", - " 2D) \n", - " \n", - " flatten (Flatten) (None, 1600) 0 \n", - " \n", - " dropout (Dropout) (None, 1600) 0 \n", - " \n", - " dense (Dense) (None, 10) 16010 \n", - " \n", - "=================================================================\n", - "Total params: 34,826\n", - "Trainable params: 34,826\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-25 16:43:47.963880: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n", - "2022-07-25 16:43:47.964009: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)\n", - "2022-07-25 16:43:47.964048: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dc8739c0aa9a): /proc/driver/nvidia/version does not exist\n", - "2022-07-25 16:43:47.964632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" - ] - } - ], + "outputs": [], "source": [ "model = keras.Sequential(\n", " [\n", @@ -126,6 +63,19 @@ "model.summary()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from yaket.converter.converter import Converter\n", + "\n", + "path_m = '/root/project/yaket/models/1658392865_best_model'\n", + "c = Converter(out_format='tflite',model = None, model_path=path_m, out_path='model')\n", + "c.convert()\n" + ] + }, { "cell_type": "code", 
"execution_count": null, @@ -143,7 +93,16 @@ "metadata": {}, "outputs": [], "source": [ - "trainer._init_trainer()" + "trainer.train(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer.convert_model(format_model = 'onnx', )" ] } ], -- GitLab From 1af72bf22cea26f356180c3ad50c57b917295aca Mon Sep 17 00:00:00 2001 From: andreafavia Date: Wed, 27 Jul 2022 15:26:10 +0200 Subject: [PATCH 19/31] Add schema for lr_scheduler, parameter loss, and remove strategy from main --- .gitignore | 2 + examples/02-bit-image.ipynb | 346 ++++++++++++++++++++++++++++ examples/Untitled-1.py | 175 ++++++++++++++ examples/files/01_custom_modules.py | 5 + examples/files/02_trainer.yaml | 22 ++ yaket/converter/converter.py | 1 - yaket/schema/schema.py | 7 +- yaket/trainer.py | 113 ++++++--- 8 files changed, 633 insertions(+), 38 deletions(-) create mode 100644 examples/02-bit-image.ipynb create mode 100644 examples/Untitled-1.py create mode 100644 examples/files/02_trainer.yaml diff --git a/.gitignore b/.gitignore index d2124ee..f9ac6b7 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ yaket/converter/__pycache__/ *.onnx *.pb *.tflite +examples/~/ +~/ diff --git a/examples/02-bit-image.ipynb b/examples/02-bit-image.ipynb new file mode 100644 index 0000000..e2eaa2c --- /dev/null +++ b/examples/02-bit-image.ipynb @@ -0,0 +1,346 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-27 10:38:10.382444: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2022-07-27 10:38:10.387014: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", + "2022-07-27 10:38:10.387027: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "import tensorflow_hub as hub\n", + "import tensorflow_datasets as tfds\n", + "\n", + "tfds.disable_progress_bar()\n", + "\n", + "SEEDS = 42\n", + "\n", + "np.random.seed(SEEDS)\n", + "tf.random.set_seed(SEEDS)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "train_ds, validation_ds = tfds.load(\n", + " \"tf_flowers\",\n", + " split=[\"train[:85%]\", \"train[85%:]\"],\n", + " as_supervised=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'PrefetchDataset' object has no attribute 'shape'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/root/project/yaket/examples/02-bit-image.ipynb Cell 3\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m train_ds\u001b[39m.\u001b[39;49mshape\n", + "\u001b[0;31mAttributeError\u001b[0m: 'PrefetchDataset' object has no attribute 'shape'" + ] + } + ], + "source": [ + "train_ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 10))\n", + "for i, (image, label) in enumerate(train_ds.take(9)):\n", + " ax = plt.subplot(3, 3, i + 1)\n", + " plt.imshow(image)\n", + " plt.title(int(label))\n", + " plt.axis(\"off\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RESIZE_TO = 384\n", + "CROP_TO = 224\n", + "BATCH_SIZE = 64\n", + "STEPS_PER_EPOCH = 10\n", + "AUTO = tf.data.AUTOTUNE # optimise the pipeline performance\n", + "NUM_CLASSES = 5 # number of classes\n", + "SCHEDULE_LENGTH = (\n", + " 500 # we will train on lower resolution images and will still attain good results\n", + ")\n", + "SCHEDULE_BOUNDARIES = [\n", + " 200,\n", + " 300,\n", + " 400,\n", + "] # more the dataset size the schedule length increase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SCHEDULE_LENGTH = SCHEDULE_LENGTH * 512 / BATCH_SIZE\n", + "\n", + "\n", + "@tf.function\n", + "def preprocess_train(image, label):\n", + " image = tf.image.random_flip_left_right(image)\n", + " image = tf.image.resize(image, (RESIZE_TO, RESIZE_TO))\n", + " image = tf.image.random_crop(image, (CROP_TO, CROP_TO, 3))\n", + " image = image / 255.0\n", + " return (image, label)\n", + "\n", + "\n", + "@tf.function\n", + "def preprocess_test(image, label):\n", + " image = tf.image.resize(image, (RESIZE_TO, RESIZE_TO))\n", + " image = image / 255.0\n", + " return (image, label)\n", + "\n", + "\n", + "DATASET_NUM_TRAIN_EXAMPLES = 
train_ds.cardinality().numpy()\n", + "\n", + "repeat_count = int(\n", + " SCHEDULE_LENGTH * BATCH_SIZE / DATASET_NUM_TRAIN_EXAMPLES * STEPS_PER_EPOCH\n", + ")\n", + "repeat_count += 50 + 1 # To ensure at least there are 50 epochs of training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Training pipeline\n", + "pipeline_train = (\n", + " train_ds.shuffle(10000)\n", + " .repeat(repeat_count) # Repeat dataset_size / num_steps\n", + " .map(preprocess_train, num_parallel_calls=AUTO)\n", + " .batch(BATCH_SIZE)\n", + " .prefetch(AUTO)\n", + ")\n", + "\n", + "# Validation pipeline\n", + "pipeline_validation = (\n", + " validation_ds.map(preprocess_test, num_parallel_calls=AUTO)\n", + " .batch(BATCH_SIZE)\n", + " .prefetch(AUTO)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_batch, label_batch = next(iter(pipeline_train))\n", + "\n", + "plt.figure(figsize=(10, 10))\n", + "for n in range(25):\n", + " ax = plt.subplot(5, 5, n + 1)\n", + " plt.imshow(image_batch[n])\n", + " plt.title(label_batch[n].numpy())\n", + " plt.axis(\"off\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bit_model_url = \"https://tfhub.dev/google/bit/m-r50x1/1\"\n", + "bit_module = hub.KerasLayer(bit_model_url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class MyBiTModel(keras.Model):\n", + " def __init__(self, num_classes, module, **kwargs):\n", + " super().__init__(**kwargs)\n", + "\n", + " self.num_classes = num_classes\n", + " self.head = keras.layers.Dense(num_classes, kernel_initializer=\"zeros\")\n", + " self.bit_model = module\n", + "\n", + " def call(self, images):\n", + " bit_embedding = self.bit_model(images)\n", + " return self.head(bit_embedding)\n", + "\n", + "\n", + "model = MyBiTModel(num_classes=NUM_CLASSES, module=bit_module)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "keras.optimizers.schedules.learning_rate_schedule.PiecewiseConstantDecay" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "getattr(keras.optimizers.schedules,'PiecewiseConstantDecay')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." 
+ ] + } + ], + "source": [ + "learning_rate = 0.003 * BATCH_SIZE / 512\n", + "\n", + "# Decay learning rate by a factor of 10 at SCHEDULE_BOUNDARIES.\n", + "lr_schedule = keras.optimizers.schedules.PiecewiseConstantDecay(\n", + " boundaries=SCHEDULE_BOUNDARIES,\n", + " values=[\n", + " learning_rate,\n", + " learning_rate * 0.1,\n", + " learning_rate * 0.01,\n", + " learning_rate * 0.001,\n", + " ],\n", + ")\n", + "optimizer = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.9)\n", + "\n", + "loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.compile(optimizer=optimizer, loss=loss_fn, metrics=[\"accuracy\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_callbacks = [\n", + " keras.callbacks.EarlyStopping(\n", + " monitor=\"val_accuracy\", patience=2, restore_best_weights=True\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "history = model.fit(\n", + " pipeline_train,\n", + " batch_size=BATCH_SIZE,\n", + " epochs=1,\n", + " steps_per_epoch=STEPS_PER_EPOCH,\n", + " validation_data=pipeline_validation,\n", + " callbacks=train_callbacks,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "accuracy = model.evaluate(pipeline_validation)[1] * 100\n", + "print(\"Accuracy: {:.2f}%\".format(accuracy))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.4 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/Untitled-1.py b/examples/Untitled-1.py new file mode 100644 index 0000000..7996314 --- /dev/null +++ b/examples/Untitled-1.py @@ -0,0 +1,175 @@ +# %% +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import tensorflow as tf +from tensorflow import keras +import tensorflow_hub as hub +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() + +SEEDS = 42 + +np.random.seed(SEEDS) +tf.random.set_seed(SEEDS) + +# %% +train_ds, validation_ds = tfds.load( + "tf_flowers", + split=["train[:85%]", "train[85%:]"], + as_supervised=True, +) + +# # %% +# plt.figure(figsize=(10, 10)) +# for i, (image, label) in enumerate(train_ds.take(9)): +# ax = plt.subplot(3, 3, i + 1) +# plt.imshow(image) +# plt.title(int(label)) +# plt.axis("off") + + +# %% +RESIZE_TO = 384 +CROP_TO = 224 +BATCH_SIZE = 64 +STEPS_PER_EPOCH = 10 +AUTO = tf.data.AUTOTUNE # optimise the pipeline performance +NUM_CLASSES = 5 # number of classes +SCHEDULE_LENGTH = ( + 500 # we will train on lower resolution images and will still attain good results +) +SCHEDULE_BOUNDARIES = [ + 200, + 300, + 400, +] # more the dataset size the schedule length increase + +# %% +SCHEDULE_LENGTH = SCHEDULE_LENGTH * 512 / BATCH_SIZE + + +@tf.function +def 
preprocess_train(image, label): + image = tf.image.random_flip_left_right(image) + image = tf.image.resize(image, (RESIZE_TO, RESIZE_TO)) + image = tf.image.random_crop(image, (CROP_TO, CROP_TO, 3)) + image = image / 255.0 + return (image, label) + + +@tf.function +def preprocess_test(image, label): + image = tf.image.resize(image, (RESIZE_TO, RESIZE_TO)) + image = image / 255.0 + return (image, label) + + +DATASET_NUM_TRAIN_EXAMPLES = train_ds.cardinality().numpy() + +repeat_count = int( + SCHEDULE_LENGTH * BATCH_SIZE / DATASET_NUM_TRAIN_EXAMPLES * STEPS_PER_EPOCH +) +repeat_count += 50 + 1 # To ensure at least there are 50 epochs of training + +# # %% +# # Training pipeline +pipeline_train = ( + train_ds.shuffle(10000) + .repeat(repeat_count) # Repeat dataset_size / num_steps + .map(preprocess_train, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +# Validation pipeline +pipeline_validation = ( + validation_ds.map(preprocess_test, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +# # %% +# image_batch, label_batch = next(iter(pipeline_train)) + +# plt.figure(figsize=(10, 10)) +# for n in range(25): +# ax = plt.subplot(5, 5, n + 1) +# plt.imshow(image_batch[n]) +# plt.title(label_batch[n].numpy()) +# plt.axis("off") + +# # %% +bit_model_url = "https://tfhub.dev/google/bit/m-r50x1/1" +bit_module = hub.KerasLayer(bit_model_url) + +# %% +class MyBiTModel(tf.keras.Model): + def __init__(self, num_classes, module, **kwargs): + super().__init__(**kwargs) + + self.num_classes = num_classes + self.head = keras.layers.Dense(num_classes, kernel_initializer="zeros") + self.bit_model = module + + def call(self, images): + bit_embedding = self.bit_model(images) + return self.head(bit_embedding) + + +model = MyBiTModel(num_classes=5, module=bit_module) + +from yaket.trainer import Trainer + +path = '/root/project/yaket/examples/files/02_trainer.yaml' +trainer = Trainer(path, model = model, train_dataset = pipeline_train, val_dataset = pipeline_validation) +trainer.train(1) + +# # %% +# learning_rate = 0.003 * BATCH_SIZE / 512 + +# # Decay learning rate by a factor of 10 at SCHEDULE_BOUNDARIES. 
+# lr_schedule = keras.optimizers.schedules.PiecewiseConstantDecay( +# boundaries=SCHEDULE_BOUNDARIES, +# values=[ +# learning_rate, +# learning_rate * 0.1, +# learning_rate * 0.01, +# learning_rate * 0.001, +# ], +# ) +# optimizer = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.9) + +# loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + +# # %% +# model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"]) + + +# # %% +# train_callbacks = [ +# keras.callbacks.EarlyStopping( +# monitor="val_accuracy", patience=2, restore_best_weights=True +# ) +# ] + +# # %% +# history = model.fit( +# pipeline_train, +# batch_size=BATCH_SIZE, +# epochs=1, +# steps_per_epoch=STEPS_PER_EPOCH, +# validation_data=pipeline_validation, +# callbacks=train_callbacks, +# ) + +# # %% +# accuracy = model.evaluate(pipeline_validation)[1] * 100 +# print("Accuracy: {:.2f}%".format(accuracy)) + +# # %% + + + diff --git a/examples/files/01_custom_modules.py b/examples/files/01_custom_modules.py index bcfb9cc..d22cca5 100644 --- a/examples/files/01_custom_modules.py +++ b/examples/files/01_custom_modules.py @@ -3,6 +3,8 @@ import tensorflow as tf from tensorflow import keras from jiwer import wer + +########################################################## 02-ASR ########################################################## # The set of characters accepted in the transcription. characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "] # Mapping characters to integers @@ -65,3 +67,6 @@ def CTCLoss(y_true, y_pred): loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length) return loss + + +########################################################## 03- BiT ########################################################## diff --git a/examples/files/02_trainer.yaml b/examples/files/02_trainer.yaml new file mode 100644 index 0000000..233c793 --- /dev/null +++ b/examples/files/02_trainer.yaml @@ -0,0 +1,22 @@ +autolog: False +optimizer: + - Adam + - PiecewiseConstantDecay: + boundaries: [200, 300, 400] + values: [0.003, 0.0003,0.00003,0.000003] +batch_size: 64 +loss: + SparseCategoricalCrossentropy: + from_logits: True +callbacks: + - EarlyStopping: + monitor: val_accuracy + patience: 2 + restore_best_weights: True +verbose: 1 # 0, 1, 2 +epochs: 1 +# shuffle: False +class_weights: # First value is boolean + - False +accelerator: gpu #Make it optional +steps_per_epoch: 10 \ No newline at end of file diff --git a/yaket/converter/converter.py b/yaket/converter/converter.py index d991c04..135b798 100644 --- a/yaket/converter/converter.py +++ b/yaket/converter/converter.py @@ -39,7 +39,6 @@ class Converter: "Function to convert a keras model to onnx using command line" # submit command to command line - if not self.out_path.endswith(".onnx"): self.out_path = self.out_path + ".onnx" try: diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py index 0c29322..9eabe31 100644 --- a/yaket/schema/schema.py +++ b/yaket/schema/schema.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Any, Tuple, Union +from typing import Dict, Optional, Any, Tuple, Union, List import yaml import os from pydantic import ( @@ -28,17 +28,18 @@ class Accelerator(Enum): class TrainingModel(BaseModel, extra=Extra.allow): autolog: bool = False - optimizer: constr(strict=True) = 'Adam' + optimizer: Union[constr(strict=True), conlist(item_type=Any,min_items = 1, max_items = 2)] = 'Adam' optimizer_params: Optional[Dict[str, Any]] = None epochs: PositiveInt = 1 batch_size: PositiveInt = 1 # if format is 
numpy
-    loss: constr(strict=True)
+    loss: Union[constr(strict=True), Dict[str, Any]]
     callbacks: Optional[conlist(item_type=Union[str,Dict[str, Any]], min_items=0)]
     metrics: Optional[conlist(item_type=Union[str,Dict], min_items=1, unique_items=True)]
     verbose: conint(ge=1, le=2) = 1
     shuffle: bool = True
     class_weights: Optional[conlist(item_type=Any, min_items=1)]
     accelerator: Optional[constr(strict=True)]
+    steps_per_epoch: Optional[PositiveInt] = None
 
     @validator('accelerator')
     def accelerator_validator(cls, v):
diff --git a/yaket/trainer.py b/yaket/trainer.py
index 4d713de..afff8b9 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from enum import Enum, auto
+from optparse import OptionParser
 from typing import List, Optional, Tuple, Union, Any, Dict, Callable
 
 import numpy as np
@@ -72,6 +73,23 @@ class Trainer:
         self._callbacks = self._get_callbacks()
         self._input_shape = self._get_input_shape()
 
+    def _train(self, train_dataset, val_dataset, epochs: int = None):
+        """Train the model"""
+
+        self._compile_model()
+        history = self.model.fit(
+            x=train_dataset,
+            y=None,
+            epochs=int(self.config.epochs) if epochs is None else epochs,
+            validation_data=val_dataset,
+            batch_size=None,
+            callbacks=self._callbacks,
+            steps_per_epoch=int(self.config.steps_per_epoch),
+            class_weight=None,  # TODO: add class_weight,
+            verbose=int(self.config.verbose),
+        )
+        return history
+
     def train(self, epochs: int = None):
         """Train the model. Main function to call.
 
@@ -85,27 +103,17 @@ class Trainer:
         self._init_trainer()
         self._autolog()
 
-        strategy = self._get_strategy()
-        batch_size = strategy.num_replicas_in_sync * self.config.batch_size
-        train_dataset = self._get_x_y_train(batch_size)
-        val_dataset = self._get_x_y_val(batch_size)
-
-        with strategy.scope():
-
-            if self.strategy is None:
-                self._clone_model()
-            self._compile_model()
-
-            history = self.model.fit(
-                x=train_dataset,
-                y=None,
-                epochs=int(self.config.epochs) if epochs is None else epochs,
-                validation_data=val_dataset,
-                batch_size=None,
-                callbacks=self._callbacks,
-                class_weight=None,  # TODO: add class_weight,
-                verbose=int(self.config.verbose),
-            )
+        if self.strategy is None and self.config.accelerator is Accelerator.cpu:
+            train_dataset = self._get_x_y_train(self.config.batch_size)
+            val_dataset = self._get_x_y_val(self.config.batch_size)
+            history = self._train(train_dataset, val_dataset, epochs)
+        else:
+            strategy = self._get_strategy()
+            batch_size = strategy.num_replicas_in_sync * self.config.batch_size
+            train_dataset = self._get_x_y_train(batch_size)
+            val_dataset = self._get_x_y_val(batch_size)
+            with strategy.scope():
+                history = self._train(train_dataset, val_dataset, epochs)
 
         self._save_model()
         self._history = history.history
@@ -302,27 +310,64 @@ class Trainer:
     def _get_optimizer(self) -> tf.keras.optimizers.Optimizer:
         """Get the optimizer from the config file"""
 
-        opt_pars = self.config.optimizer_params
         default_value = "not_found"
-        optimizer = getattr(
+        opt_pars = dict()
+        opt = self.config.optimizer
+        if isinstance(opt, List):
+            if len(opt) == 1 and isinstance(opt[0], Dict):
+                # Single {name: params} mapping
+                k = list(opt[0].keys())[0]
+                opt_pars = list(opt[0].values())[0]
+                optimizer = getattr(tf.keras.optimizers, k, default_value)
+                if optimizer != default_value:
+                    return optimizer(**opt_pars)
+                else:
+                    return self._load_custom_module(k, opt_pars)
+            else:
+                opt_name = opt[0]
+                optimizer = getattr(
+                    tf.keras.optimizers, f"{opt_name}", default_value
+                )
+                if optimizer == default_value:
+                    optimizer = self._load_custom_module(opt_name)
+                if isinstance(opt[1], dict):
+                    k = list(opt[1].keys())[0]
+                    v = list(opt[1].values())[0]
+                    scheduler = getattr(tf.keras.optimizers.schedules, k, default_value)
+                    if scheduler == default_value:
+                        raise ValueError(f"{k} is not a valid scheduler. Only Keras schedulers are supported")
+                    else:
+                        opt_pars['learning_rate'] = scheduler(**v)
+                return optimizer(**opt_pars)
+        elif isinstance(opt, str):
+            optimizer = getattr(
                 tf.keras.optimizers, f"{self.config.optimizer}", default_value
-        )
-        if optimizer != default_value:
-            return optimizer(**opt_pars)
-        else:
-            return self._load_custom_module(optimizer, opt_pars)
+            )
+            if optimizer != default_value:
+                return optimizer(**opt_pars)
+            else:
+                return self._load_custom_module(optimizer, opt_pars)
 
     def _get_loss(self) -> Union[tf.keras.losses.Loss, Callable]:
         """Get the loss from the config file"""
 
-        loss_name = self.config.loss
+        loss_config = self.config.loss
         default_value = "not_found"
 
-        loss = getattr(tf.keras.losses, loss_name, default_value)
-        if loss != default_value:
-            return loss()
-        else:  # it's a custom loss
-            return self._load_custom_module(loss_name)
+        if isinstance(loss_config, str):
+            loss = getattr(tf.keras.losses, loss_config, default_value)
+            if loss != default_value:
+                return loss()
+            else:  # it's a custom loss
+                return self._load_custom_module(loss_config)
+        elif isinstance(loss_config, Dict):
+            loss_name = list(loss_config.keys())[0]
+            loss_params = list(loss_config.values())[0]
+            loss = getattr(tf.keras.losses, loss_name, default_value)
+            if loss != default_value:
+                return loss(**loss_params)
+            else:
+                return self._load_custom_module(loss_name, loss_params)
 
     def _get_callbacks(self) -> List[tf.keras.callbacks.Callback]:
         """Get the callbacks from the config file"""
-- GitLab

From c93b0f0b763da2b6700e926a97f54e691ee0aafe Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Wed, 27 Jul 2022 15:46:00 +0200
Subject: [PATCH 20/31] Check existence gpu, disable autoshard input_dataset,
 make dev set optional

---
 examples/Untitled-1.py         | 2 +-
 examples/files/02_trainer.yaml | 4 ++--
 yaket/schema/schema.py         | 6 ++++++
 yaket/trainer.py               | 8 ++++----
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/examples/Untitled-1.py b/examples/Untitled-1.py
index 7996314..7a034de 100644
--- a/examples/Untitled-1.py
+++ b/examples/Untitled-1.py
@@ -124,7 +124,7 @@ model = MyBiTModel(num_classes=5, module=bit_module)
 from yaket.trainer import Trainer
 
 path = '/root/project/yaket/examples/files/02_trainer.yaml'
-trainer = Trainer(path, model = model, train_dataset = pipeline_train, val_dataset = pipeline_validation)
+trainer = Trainer(path, model = model, train_dataset = pipeline_train)
 trainer.train(1)
 
diff --git a/examples/files/02_trainer.yaml b/examples/files/02_trainer.yaml
index 233c793..3a1c8df 100644
--- a/examples/files/02_trainer.yaml
+++ b/examples/files/02_trainer.yaml
@@ -18,5 +18,5 @@ epochs: 1
 # shuffle: False
 class_weights: # First value is boolean
   - False
-accelerator: gpu #Make it optional
-steps_per_epoch: 10
\ No newline at end of file
+accelerator: cpu #Make it optional
+steps_per_epoch: 1
\ No newline at end of file
diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py
index 9eabe31..b2eb881 100644
--- a/yaket/schema/schema.py
+++ b/yaket/schema/schema.py
@@ -24,6 +24,10 @@ class Accelerator(Enum):
     @classmethod
     def list(cls):
         return list(map(lambda c: c.name, cls))
+    @classmethod
+    def list_gpu(cls):
+        return list(map(lambda c: c.name if 'gpu' in c.name else '', cls))
+
 
 class
TrainingModel(BaseModel, extra=Extra.allow): @@ -47,6 +51,8 @@ class TrainingModel(BaseModel, extra=Extra.allow): return None if v not in Accelerator.list(): raise ValueError(f'{v} is not a valid accelerator.\nPlease use: {Accelerator.list()}') + if v in Accelerator.list_gpu() and not os.environ.get('CUDA_VISIBLE_DEVICES'): + raise ValueError('ERROR: No GPU has been detected. Change accelerator.') return v diff --git a/yaket/trainer.py b/yaket/trainer.py index afff8b9..b3e0697 100644 --- a/yaket/trainer.py +++ b/yaket/trainer.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from enum import Enum, auto -from optparse import OptionParser +import warnings from typing import List, Optional, Tuple, Union, Any, Dict, Callable import numpy as np @@ -21,7 +21,7 @@ class Trainer: config_path: str model: tf.keras.Model train_dataset: Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset] - val_dataset: Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset] + val_dataset: Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset] = None strategy: Optional[tf.distribute.Strategy] = None random_seed: int = 1234 validate_yaml: bool = True @@ -205,7 +205,7 @@ class Trainer: if self.val_dataset is None: return None if isinstance(self.val_dataset, tf.data.Dataset): - return self.val_dataset + return self.val_dataset.with_options(options) else: val = ( tf.data.Dataset.from_tensor_slices(self.val_dataset) @@ -222,7 +222,7 @@ class Trainer: ) if isinstance(self.train_dataset, tf.data.Dataset): - x = self.train_dataset + x = self.train_dataset.with_options(options) else: x = tf.data.Dataset.from_tensor_slices(self.train_dataset) if self.config.shuffle: -- GitLab From c7fa80e394e76e38bd7a124e9672770282fe67dc Mon Sep 17 00:00:00 2001 From: Andrea Favia Date: Thu, 28 Jul 2022 06:52:55 +0000 Subject: [PATCH 21/31] Update README.md --- README.md | 89 ++++++++----------------------------------------------- 1 file changed, 12 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index 1683aa6..3ada96a 100644 --- a/README.md +++ b/README.md @@ -1,92 +1,27 @@ -# keras yaml trainer +# YAKET: Yaml Keras Trainer (or Yet Another Keras Trainer) - - -## Getting started - -To make it easy for you to get started with GitLab, here's a list of recommended next steps. - -Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! 
-
-## Add your files
-
-- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files
-- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command:
-
-```
-cd existing_repo
-git remote add origin https://gitlab.com/andreafavia/keras-yaml-trainer.git
-git branch -M main
-git push -uf origin main
-```
-
-## Integrate with your tools
-
-- [ ] [Set up project integrations](https://gitlab.com/andreafavia/keras-yaml-trainer/-/settings/integrations)
-
-## Collaborate with your team
-
-- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/)
-- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html)
-- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically)
-- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/)
-- [ ] [Automatically merge when pipeline succeeds](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html)
-
-## Test and Deploy
-
-Use the built-in continuous integration in GitLab.
-
-- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html)
-- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing(SAST)](https://docs.gitlab.com/ee/user/application_security/sast/)
-- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html)
-- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/)
-- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html)
-
-***
-
-# Editing this README
-
-When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thank you to [makeareadme.com](https://www.makeareadme.com/) for this template.
-
-## Suggestions for a good README
-Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information.
-
-## Name
-Choose a self-explaining name for your project.
+## Project status
+Work In Progress.
 
 ## Description
-Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors.
+Yaket is a lightweight and simple module to train Keras models by defining parameters directly in a YAML file.
+This allows developers to focus solely on what matters: data and model development.
+By having parameters defined in a human-readable format, it is possible to get a holistic view of the training procedure without opening the code.
+Moreover, Data Scientists and ML Engineers won't need to manually add all training parameters, such as optimizers, callbacks, and schedulers, thus reducing the
+likelihood of human-induced code bugs.
 
 ## Badges
-On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge.
+TODO: Tests are not covering the code YET.
 
 ## Visuals
-Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method.
+
 
 ## Installation
-Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
+    pip install yaket
 
 ## Usage
 Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
 
-## Support
-Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
-
-## Roadmap
-If you have ideas for releases in the future, it is a good idea to list them in the README.
-
-## Contributing
-State if you are open to contributions and what your requirements are for accepting them.
-
-For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self.
-
-You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser.
-
-## Authors and acknowledgment
-Show your appreciation to those who have contributed to the project.
-
 ## License
-For open source projects, say how it is licensed.
+MIT License
 
-## Project status
-If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers.
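The YAML-driven flow this README describes reduces to: load the file, validate it against the Pydantic schema introduced earlier in the series, and hand the validated parameters to the trainer. A minimal sketch, assuming the `TrainingModel` schema in `yaket/schema/schema.py` as of PATCH 19; the config file name and its values are illustrative only:

```python
# Minimal sketch of the YAML -> Pydantic validation flow (file name is hypothetical).
import yaml

from yaket.schema.schema import TrainingModel  # schema as defined in PATCH 19

with open("trainer.yaml") as f:
    raw_params = yaml.safe_load(f)  # plain dict straight from the YAML file

# Pydantic rejects typos and out-of-range values up front,
# e.g. verbose: 3 (the schema constrains it to 1..2) or epochs: 0.
config = TrainingModel(**raw_params)
print(config.optimizer, config.epochs, config.loss)
```

Because the schema supplies defaults (e.g. `optimizer: 'Adam'`, `epochs: 1`), a config file only needs the fields that deviate from them.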
-- GitLab

From 36d503bf0a3138ecd5d95b96f711590837fa30db Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Thu, 28 Jul 2022 09:00:19 +0200
Subject: [PATCH 22/31] Fix _accelerator variable

---
 yaket/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yaket/trainer.py b/yaket/trainer.py
index b3e0697..aad0d38 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -84,7 +84,7 @@ class Trainer:
             validation_data=val_dataset,
             batch_size=None,
             callbacks=self._callbacks,
-            steps_per_epoch=int(self.config.steps_per_epoch),
+            steps_per_epoch=int(self.config.steps_per_epoch) if self.config.steps_per_epoch is not None else None,
             class_weight=None,  # TODO: add class_weight,
             verbose=int(self.config.verbose),
         )
@@ -103,7 +103,7 @@ class Trainer:
         self._init_trainer()
         self._autolog()
 
-        if self.strategy is None and self.config.accelerator is Accelerator.cpu:
+        if self.strategy is None and self._accelerator is Accelerator.cpu:
             train_dataset = self._get_x_y_train(self.config.batch_size)
             val_dataset = self._get_x_y_val(self.config.batch_size)
             history = self._train(train_dataset, val_dataset, epochs)
-- GitLab

From a7fc1b6b8beadb2acd600565d3db810fcde1536f Mon Sep 17 00:00:00 2001
From: Andrea Favia
Date: Thu, 28 Jul 2022 07:02:24 +0000
Subject: [PATCH 23/31] Update examples/files/02_trainer.yaml

---
 examples/files/02_trainer.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/files/02_trainer.yaml b/examples/files/02_trainer.yaml
index 3a1c8df..24214e8 100644
--- a/examples/files/02_trainer.yaml
+++ b/examples/files/02_trainer.yaml
@@ -13,10 +13,10 @@ callbacks:
       monitor: val_accuracy
       patience: 2
       restore_best_weights: True
-verbose: 1 # 0, 1, 2
+verbose: 1
 epochs: 1
-# shuffle: False
-class_weights: # First value is boolean
+shuffle: False
+class_weights:
   - False
-accelerator: cpu #Make it optional
-steps_per_epoch: 1
\ No newline at end of file
+accelerator: cpu
+steps_per_epoch: 1
-- GitLab

From 44782e9d09aa6898711e579e760b1e208e2fd5a7 Mon Sep 17 00:00:00 2001
From: Andrea Favia
Date: Thu, 28 Jul 2022 07:03:50 +0000
Subject: [PATCH 24/31] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3ada96a..77e3ddd 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Work In Progress.
 
 ## Description
-Yaket is a lightweight and simple module to train Keras models by defining parameters directly in a YAML file.
+Yaket is a lightweight and simple module to train Keras models by defining parameters directly in a YAML file. YAML parameters are validated with Pydantic, so typos or disallowed parameters throw errors at the start of execution.
 This allows developers to focus solely on what matters: data and model development.
 By having parameters defined in a human-readable format, it is possible to get a holistic view of the training procedure without opening the code.
 Moreover, Data Scientists and ML Engineers won't need to manually add all training parameters, such as optimizers, callbacks, and schedulers, thus reducing the
@@ -20,7 +20,7 @@
     pip install yaket
 
 ## Usage
-Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
+The module uses Pydantic to validate the YAML training parameters.
 
 ## License
 MIT License
-- GitLab

From 870d4079917b1d6aad26c3e20190d3a7b56e19d3 Mon Sep 17 00:00:00 2001
From: Andrea Favia
Date: Thu, 28 Jul 2022 07:19:57 +0000
Subject: [PATCH 25/31] Update README.md

---
 README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/README.md b/README.md
index 77e3ddd..b6b45a7 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,51 @@ TODO: Tests are not covering the code YET.
 
 ## Visuals
 
+The YAML file contains most of the parameters used in Keras model.fit, such as epochs, verbose, and callbacks. Below is an example:
+
+```yaml
+  autolog: False
+  optimizer:
+    - Adam
+    - PiecewiseConstantDecay:
+        boundaries: [200, 300, 400]
+        values: [0.003, 0.0003, 0.00003, 0.000003]
+  batch_size: 64
+  loss:
+    SparseCategoricalCrossentropy:
+      from_logits: True
+  callbacks:
+    - EarlyStopping:
+        monitor: val_accuracy
+        patience: 2
+        restore_best_weights: True
+  verbose: 1
+  epochs: 1
+  shuffle: False
+  class_weights:
+    - False
+  accelerator: cpu
+  steps_per_epoch: 1
+```
+
+Usage from Python is just as simple:
+
+```python
+    ...
+    # Define path to yaml file
+    path = "/yaket/examples/files/trainer.yaml"
+
+    trainer = Trainer(
+        config_path=path,
+        train_dataset=(x_train, y_train),  # Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset]
+        val_dataset=(x_test, y_test),  # Union[Tuple[np.ndarray, np.ndarray], tf.data.Dataset]
+        model=model,  # Keras Model
+    )
+    trainer.train()
+```
+
+
 ## Installation
 
     pip install yaket
-- GitLab

From b13d7b876c1d8dafc8165fe775fda4c8571be1b0 Mon Sep 17 00:00:00 2001
From: Andrea Favia
Date: Thu, 28 Jul 2022 07:43:38 +0000
Subject: [PATCH 26/31] Deleted yaket/README.md

---
 yaket/README.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 yaket/README.md

diff --git a/yaket/README.md b/yaket/README.md
deleted file mode 100644
index e69de29..0000000
-- GitLab

From 927f034ad19d092b1176c4b569578794cd2b0e20 Mon Sep 17 00:00:00 2001
From: Andrea Favia
Date: Thu, 28 Jul 2022 07:43:50 +0000
Subject: [PATCH 27/31] Deleted .vscode/settings.json

---
 .vscode/settings.json | 3 ---
 1 file changed, 3 deletions(-)
 delete mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index de288e1..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "python.formatting.provider": "black"
-}
\ No newline at end of file
-- GitLab

From 9cdba41b9ccb5e7c534eb6079555dea0368cb04a Mon Sep 17 00:00:00 2001
From: Andrea Favia
Date: Thu, 28 Jul 2022 07:52:33 +0000
Subject: [PATCH 28/31] Update README.md

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index b6b45a7..067f013 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,15 @@ By having parameters defined in a human-readable format, it is possible to get
 Moreover, Data Scientists and ML Engineers won't need to manually add all training parameters, such as optimizers, callbacks, and schedulers, thus reducing the
 likelihood of human-induced code bugs.
 
+YAKET allows you to:
+1. Train models with TensorFlow's default optimizers, metrics, callbacks, and losses.
+2. Train models with custom modules defined in a Python script whose path is passed as an argument to the Trainer class.
+3. Quickly use distributed multi-GPU and TPU training with `tf.distribute.Strategy` (experimental).
+4. Log training parameters, models, and results with the `mlflow.tensorflow.autolog()` module. Runs are saved in the `mlruns` folder.
+5. Save the model to a given folder and format (i.e., SavedModel, H5, or .pb).
+6. Convert the saved model to ONNX/TensorFlow Lite for edge deployment or faster inference.
+7. More to come!
+
 ## Badges
 TODO: Tests are not covering the code YET.
-- GitLab

From f1935c2e77a051ecf63a2e5f5e2a6b86ff336b10 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Thu, 28 Jul 2022 10:17:17 +0200
Subject: [PATCH 29/31] Remove unused imports

---
 yaket/trainer.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/yaket/trainer.py b/yaket/trainer.py
index aad0d38..a98a747 100644
--- a/yaket/trainer.py
+++ b/yaket/trainer.py
@@ -1,6 +1,4 @@
 from dataclasses import dataclass
-from enum import Enum, auto
-import warnings
 from typing import List, Optional, Tuple, Union, Any, Dict, Callable
 
 import numpy as np
-- GitLab

From 232c8fda99934d26d96cdc9aeef862ff8f1cfa32 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Thu, 28 Jul 2022 12:08:33 +0200
Subject: [PATCH 30/31] Upload to PyPi first version

---
 .gitignore                          |   1 +
 examples/Untitled-1.py              | 175 ----------------------------
 pyproject.toml                      |  28 +++++
 setup.py                            |  17 +--
 wheels/yaket-0.0.1-py3-none-any.whl | Bin 0 -> 9943 bytes
 yaket/__init__.py                   |   1 +
 yaket/schema/schema.py              |   5 +-
 7 files changed, 36 insertions(+), 191 deletions(-)
 delete mode 100644 examples/Untitled-1.py
 create mode 100644 pyproject.toml
 create mode 100644 wheels/yaket-0.0.1-py3-none-any.whl

diff --git a/.gitignore b/.gitignore
index f9ac6b7..74bd3c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ yaket/converter/__pycache__/
 *.tflite
 examples/~/
 ~/
+dist/
\ No newline at end of file
diff --git a/examples/Untitled-1.py b/examples/Untitled-1.py
deleted file mode 100644
index 7a034de..0000000
--- a/examples/Untitled-1.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# %%
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-
-import tensorflow as tf
-from tensorflow import keras
-import tensorflow_hub as hub
-import tensorflow_datasets as tfds
-
-tfds.disable_progress_bar()
-
-SEEDS = 42
-
-np.random.seed(SEEDS)
-tf.random.set_seed(SEEDS)
-
-# %%
-train_ds, validation_ds = tfds.load(
-    "tf_flowers",
-    split=["train[:85%]", "train[85%:]"],
-    as_supervised=True,
-)
-
-# # %%
-# plt.figure(figsize=(10, 10))
-# for i, (image, label) in enumerate(train_ds.take(9)):
-#     ax = plt.subplot(3, 3, i + 1)
-#     plt.imshow(image)
-#     plt.title(int(label))
-#     plt.axis("off")
-
-
-# %%
-RESIZE_TO = 384
-CROP_TO = 224
-BATCH_SIZE = 64
-STEPS_PER_EPOCH = 10
-AUTO = tf.data.AUTOTUNE  # optimise the pipeline performance
-NUM_CLASSES = 5  # number of classes
-SCHEDULE_LENGTH = (
-    500  # we will train on lower resolution images and will still attain good results
-)
-SCHEDULE_BOUNDARIES = [
-    200,
-    300,
-    400,
-]  # more the dataset size the schedule length increase
-
-# %%
-SCHEDULE_LENGTH = SCHEDULE_LENGTH * 512 / BATCH_SIZE
-
-
-@tf.function
-def preprocess_train(image, label):
-    image = tf.image.random_flip_left_right(image)
-    image = tf.image.resize(image, (RESIZE_TO, RESIZE_TO))
-    image = tf.image.random_crop(image, (CROP_TO, CROP_TO, 3))
-    image = image / 255.0
-    return (image, label)
-
-
-@tf.function
-def preprocess_test(image, label):
-    image = tf.image.resize(image, (RESIZE_TO, RESIZE_TO))
-    image = image / 255.0
-    return (image, label)
-
-
-DATASET_NUM_TRAIN_EXAMPLES = train_ds.cardinality().numpy()
-
-repeat_count = int(
-    SCHEDULE_LENGTH * BATCH_SIZE / DATASET_NUM_TRAIN_EXAMPLES * STEPS_PER_EPOCH
-)
-repeat_count += 50 + 1  # To ensure at least there are 50 epochs of training
-
-# # %%
-# # Training pipeline -pipeline_train = ( - train_ds.shuffle(10000) - .repeat(repeat_count) # Repeat dataset_size / num_steps - .map(preprocess_train, num_parallel_calls=AUTO) - .batch(BATCH_SIZE) - .prefetch(AUTO) -) - -# Validation pipeline -pipeline_validation = ( - validation_ds.map(preprocess_test, num_parallel_calls=AUTO) - .batch(BATCH_SIZE) - .prefetch(AUTO) -) - -# # %% -# image_batch, label_batch = next(iter(pipeline_train)) - -# plt.figure(figsize=(10, 10)) -# for n in range(25): -# ax = plt.subplot(5, 5, n + 1) -# plt.imshow(image_batch[n]) -# plt.title(label_batch[n].numpy()) -# plt.axis("off") - -# # %% -bit_model_url = "https://tfhub.dev/google/bit/m-r50x1/1" -bit_module = hub.KerasLayer(bit_model_url) - -# %% -class MyBiTModel(tf.keras.Model): - def __init__(self, num_classes, module, **kwargs): - super().__init__(**kwargs) - - self.num_classes = num_classes - self.head = keras.layers.Dense(num_classes, kernel_initializer="zeros") - self.bit_model = module - - def call(self, images): - bit_embedding = self.bit_model(images) - return self.head(bit_embedding) - - -model = MyBiTModel(num_classes=5, module=bit_module) - -from yaket.trainer import Trainer - -path = '/root/project/yaket/examples/files/02_trainer.yaml' -trainer = Trainer(path, model = model, train_dataset = pipeline_train) -trainer.train(1) - -# # %% -# learning_rate = 0.003 * BATCH_SIZE / 512 - -# # Decay learning rate by a factor of 10 at SCHEDULE_BOUNDARIES. -# lr_schedule = keras.optimizers.schedules.PiecewiseConstantDecay( -# boundaries=SCHEDULE_BOUNDARIES, -# values=[ -# learning_rate, -# learning_rate * 0.1, -# learning_rate * 0.01, -# learning_rate * 0.001, -# ], -# ) -# optimizer = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.9) - -# loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - -# # %% -# model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"]) - - -# # %% -# train_callbacks = [ -# keras.callbacks.EarlyStopping( -# monitor="val_accuracy", patience=2, restore_best_weights=True -# ) -# ] - -# # %% -# history = model.fit( -# pipeline_train, -# batch_size=BATCH_SIZE, -# epochs=1, -# steps_per_epoch=STEPS_PER_EPOCH, -# validation_data=pipeline_validation, -# callbacks=train_callbacks, -# ) - -# # %% -# accuracy = model.evaluate(pipeline_validation)[1] * 100 -# print("Accuracy: {:.2f}%".format(accuracy)) - -# # %% - - - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8643b57 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,28 @@ +[build-system] +requires = ["setuptools>=61.0","wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "yaket" +version = "0.0.1" +authors = [ + { name="Andrea Favia", email="andrea.favia@pm.me" }, +] +description = "YAML Keras Trainer for quick AI development" +requires-python = ">=3.7" +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] + +dependencies = [ + 'pydantic','pyyaml','mlflow','tf2onnx' +] + +[project.optional-dependencies] +tensorflow = ["tensorflow>=2.4.0"] +examples = ['jiwer','onnxruntime'] + + +[tool.setuptools.packages.find] +exclude = ['examples*'] \ No newline at end of file diff --git a/setup.py b/setup.py index 388beef..2c3862e 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,7 @@ from setuptools import setup, find_packages +import setuptools -setup(name = 'yaket', - version = '0.0.1', - description = 'YAml KEras Trainer for quick AI development', - author = 'Andrea Favia', - author_email = 'andrea.favia@pm.me', - 
url = '',
-      packages = find_packages(include = ['yaket', 'yaket.*']),
-      setup_requires = ['flake8'],
-      install_requires=['pydantic','pyyaml','mlflow','tf2onnx'],
-      extras_require = {"tensorflow": ["tensorflow>=2.4"], "jiwer": ["jiwer"],
-      "onnx_runtime": ["onnxruntime"]},
-      )
+
+
+if __name__ == "__main__":
+    setuptools.setup()
\ No newline at end of file
diff --git a/wheels/yaket-0.0.1-py3-none-any.whl b/wheels/yaket-0.0.1-py3-none-any.whl
new file mode 100644
index 0000000000000000000000000000000000000000..34854da650d68d1a88b732d4cb60b1b3dbf2b0ed
Binary files /dev/null and b/wheels/yaket-0.0.1-py3-none-any.whl differ
diff --git a/yaket/__init__.py b/yaket/__init__.py
index e69de29..9ec9ed0 100644
--- a/yaket/__init__.py
+++ b/yaket/__init__.py
@@ -0,0 +1 @@
+from .trainer import Trainer
\ No newline at end of file
diff --git a/yaket/schema/schema.py b/yaket/schema/schema.py
index b2eb881..2e8a591 100644
--- a/yaket/schema/schema.py
+++ b/yaket/schema/schema.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional, Any, Tuple, Union, List
+from typing import Dict, Optional, Any, Union
 import yaml
 import os
 from pydantic import (
@@ -9,9 +9,6 @@ from pydantic import (
     conint,
     conlist,
     constr,
-    FilePath,
-    DirectoryPath,
-    Field,
 )
 from enum import Enum, auto
-- GitLab

From 1a87537d60e7dd789167d994f463c5f3b4cd2319 Mon Sep 17 00:00:00 2001
From: andreafavia
Date: Fri, 29 Jul 2022 09:56:32 +0200
Subject: [PATCH 31/31] Remove outputs example notebooks

---
 .gitignore                             |  3 +-
 examples/00-simple-mnist-convnet.ipynb | 13 ------
 examples/01-asr-ctc.ipynb              |  2 +-
 examples/02-bit-image.ipynb            | 58 ++++----------
 4 files changed, 11 insertions(+), 65 deletions(-)

diff --git a/.gitignore b/.gitignore
index 74bd3c5..6024e06 100644
--- a/.gitignore
+++ 
b/.gitignore @@ -13,4 +13,5 @@ yaket/converter/__pycache__/ *.tflite examples/~/ ~/ -dist/ \ No newline at end of file +dist/ +examples/files/__pycache__/ diff --git a/examples/00-simple-mnist-convnet.ipynb b/examples/00-simple-mnist-convnet.ipynb index bf62606..c9ab344 100644 --- a/examples/00-simple-mnist-convnet.ipynb +++ b/examples/00-simple-mnist-convnet.ipynb @@ -63,19 +63,6 @@ "model.summary()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from yaket.converter.converter import Converter\n", - "\n", - "path_m = '/root/project/yaket/models/1658392865_best_model'\n", - "c = Converter(out_format='tflite',model = None, model_path=path_m, out_path='model')\n", - "c.convert()\n" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/examples/01-asr-ctc.ipynb b/examples/01-asr-ctc.ipynb index 3bab115..4836610 100644 --- a/examples/01-asr-ctc.ipynb +++ b/examples/01-asr-ctc.ipynb @@ -313,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/02-bit-image.ipynb b/examples/02-bit-image.ipynb index e2eaa2c..4dd5957 100644 --- a/examples/02-bit-image.ipynb +++ b/examples/02-bit-image.ipynb @@ -2,19 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-27 10:38:10.382444: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2022-07-27 10:38:10.387014: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", - "2022-07-27 10:38:10.387027: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", @@ -35,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -48,21 +38,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'PrefetchDataset' object has no attribute 'shape'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/root/project/yaket/examples/02-bit-image.ipynb Cell 3\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m train_ds\u001b[39m.\u001b[39;49mshape\n", - "\u001b[0;31mAttributeError\u001b[0m: 'PrefetchDataset' object has no attribute 'shape'" - ] - } - ], + "outputs": [], "source": [ "train_ds" ] @@ -210,20 +188,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "keras.optimizers.schedules.learning_rate_schedule.PiecewiseConstantDecay" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ 
"getattr(keras.optimizers.schedules,'PiecewiseConstantDecay')" ] @@ -232,16 +199,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." - ] - } - ], + "outputs": [], "source": [ "learning_rate = 0.003 * BATCH_SIZE / 512\n", "\n", -- GitLab