diff --git a/.gitignore b/.gitignore
index cd56dce1..186220be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,11 @@
 ## OS configs
 .DS_Store
 
+# Project
+data/*
+models/*
+reports/*
+
 # Python
 __pycache__
 .ipynb_checkpoints
diff --git a/README.md b/README.md
index 83039e3a..6fd7557f 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 ### 1. Fork / Clone this repository
 
 ```bash
-git clone https://gitlab.com/iterative.ai/cse/tutorials/course-ds-base.git
+git clone https://github.com/iterative/course-ds-base.git
 cd course-ds-base
 ```
 
@@ -15,6 +15,7 @@ cd course-ds-base
 Create virtual environment named `dvc-venv` (you may use other name)
 ```bash
 python3 -m venv dvc-venv
+echo "export PYTHONPATH=$PWD" >> dvc-venv/bin/activate
 source dvc-venv/bin/activate
 ```
 Install python libraries
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 00000000..b6e069c5
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,3 @@
+*
+!*/
+!.gitignore
\ No newline at end of file
diff --git a/data/processed/.gitignore b/data/processed/.gitignore
new file mode 100644
index 00000000..6bd59f84
--- /dev/null
+++ b/data/processed/.gitignore
@@ -0,0 +1,4 @@
+!.gitignore
+!*.dvc
+/train_iris.csv
+/test_iris.csv
\ No newline at end of file
diff --git a/data/raw/.gitignore b/data/raw/.gitignore
new file mode 100644
index 00000000..3fc404be
--- /dev/null
+++ b/data/raw/.gitignore
@@ -0,0 +1,2 @@
+!.gitignore
+!*.dvc
\ No newline at end of file
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 00000000..b722e9e1
--- /dev/null
+++ b/models/.gitignore
@@ -0,0 +1 @@
+!.gitignore
\ No newline at end of file
diff --git a/step-0-prototype.ipynb b/notebooks/step-0-prototype.ipynb
similarity index 100%
rename from step-0-prototype.ipynb
rename to notebooks/step-0-prototype.ipynb
diff --git a/notebooks/step-1-organize-ml-project.ipynb b/notebooks/step-1-organize-ml-project.ipynb
new file mode 100644
index 00000000..3a115fea
--- /dev/null
+++ b/notebooks/step-1-organize-ml-project.ipynb
@@ -0,0 +1,509 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.460557Z",
+     "start_time": "2019-06-16T21:17:29.395297Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import itertools\n",
+    "import joblib\n",
+    "import json\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import confusion_matrix, f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.485189Z",
+     "start_time": "2019-06-16T21:17:31.473720Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get data \n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.datasets import load_iris\n",
+    "\n",
+    "data = load_iris(as_frame=True)\n",
+    "dataset = data.frame\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print labels for target values \n",
+    "\n",
+    "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:32.328046Z",
+     "start_time": "2019-06-16T21:17:32.323611Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# feature names\n",
+    "\n",
+    "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n",
+    "\n",
+    "feature_names = dataset.columns.tolist()[:4]\n",
+    "feature_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save raw data\n",
+    "dataset_csv = '../data/raw/iris.csv'\n",
+    "dataset.to_csv(dataset_csv, index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Feature engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.150708Z",
+     "start_time": "2019-06-16T21:21:02.144518Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n",
+    "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n",
+    "\n",
+    "dataset = dataset[[\n",
+    "    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
+    "#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n",
+    "    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n",
+    "    'target'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.987144Z",
+     "start_time": "2019-06-16T21:21:02.976092Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save features\n",
+    "features_path = '../data/processed/featured_iris.csv'\n",
+    "dataset.to_csv(features_path, index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Split dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:07.438133Z",
+     "start_time": "2019-06-16T21:21:07.431649Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "random_state = 42\n",
+    "test_size = 0.2\n",
+    "\n",
+    "train_dataset, test_dataset = train_test_split(dataset, test_size=test_size, random_state=random_state)\n",
+    "train_dataset.shape, test_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save train and test sets\n",
+    "trainset_path = '../data/processed/train_iris.csv'\n",
+    "testset_path = '../data/processed/test_iris.csv'\n",
+    "\n",
+    "train_dataset.to_csv(trainset_path)\n",
+    "test_dataset.to_csv(testset_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:10.932148Z",
+     "start_time": "2019-06-16T21:21:10.927844Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_train = train_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.427365Z",
+     "start_time": "2019-06-16T21:21:55.416431Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create a LogisticRegression classifier and fit it on the training data\n",
+    "clf_params = {\n",
+    "    'C': 0.001,\n",
+    "    'solver': 'lbfgs',\n",
+    "    'multi_class': 'multinomial',\n",
+    "    'max_iter': 100\n",
+    "}\n",
+    "\n",
+    "logreg = LogisticRegression(**clf_params, random_state=random_state)\n",
+    "logreg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_path= '../models/model.joblib'\n",
+    "joblib.dump(logreg, model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.875303Z",
+     "start_time": "2019-06-16T21:21:55.864724Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def plot_confusion_matrix(cm,\n",
+    "                          target_names,\n",
+    "                          title='Confusion matrix',\n",
+    "                          cmap=None,\n",
+    "                          normalize=True):\n",
+    "    \"\"\"\n",
+    "    given a sklearn confusion matrix (cm), make a nice plot\n",
+    "\n",
+    "    Arguments\n",
+    "    ---------\n",
+    "    cm:           confusion matrix from sklearn.metrics.confusion_matrix\n",
+    "\n",
+    "    target_names: the class names for classification classes such as [0, 1, 2],\n",
+    "                  for example: ['high', 'medium', 'low']\n",
+    "\n",
+    "    title:        the text to display at the top of the matrix\n",
+    "\n",
+    "    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm\n",
+    "                  see http://matplotlib.org/examples/color/colormaps_reference.html\n",
+    "                  plt.get_cmap('jet') or plt.cm.Blues\n",
+    "\n",
+    "    normalize:    If False, plot the raw numbers\n",
+    "                  If True, plot the proportions\n",
+    "\n",
+    "    Usage\n",
+    "    -----\n",
+    "    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by\n",
+    "                                                              # sklearn.metrics.confusion_matrix\n",
+    "                          normalize    = True,                # show proportions\n",
+    "                          target_names = y_labels_vals,       # list of names of the classes\n",
+    "                          title        = best_estimator_name) # title of graph\n",
+    "\n",
+    "    Citation\n",
+    "    --------\n",
+    "    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n",
+    "\n",
+    "    \"\"\"\n",
+    "\n",
+    "    accuracy = np.trace(cm) / float(np.sum(cm))\n",
+    "    misclass = 1 - accuracy\n",
+    "\n",
+    "    if cmap is None:\n",
+    "        cmap = plt.get_cmap('Blues')\n",
+    "\n",
+    "    plt.figure(figsize=(8, 6))\n",
+    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
+    "    plt.title(title)\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    if target_names is not None:\n",
+    "        tick_marks = np.arange(len(target_names))\n",
+    "        plt.xticks(tick_marks, target_names, rotation=45)\n",
+    "        plt.yticks(tick_marks, target_names)\n",
+    "\n",
+    "    if normalize:\n",
+    "        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
+    "\n",
+    "    thresh = cm.max() / 1.5 if normalize else cm.max() / 2\n",
+    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
+    "        if normalize:\n",
+    "            plt.text(j, i, \"{:0.4f}\".format(cm[i, j]),\n",
+    "                     horizontalalignment=\"center\",\n",
+    "                     color=\"white\" if cm[i, j] > thresh else \"black\")\n",
+    "        else:\n",
+    "            plt.text(j, i, \"{:,}\".format(cm[i, j]),\n",
+    "                     horizontalalignment=\"center\",\n",
+    "                     color=\"white\" if cm[i, j] > thresh else \"black\")\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.ylabel('True label')\n",
+    "    plt.xlabel('Predicted label\\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))\n",
+    "    \n",
+    "    return plt.gcf()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.090756Z",
+     "start_time": "2019-06-16T21:21:56.086966Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_test = test_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.270245Z",
+     "start_time": "2019-06-16T21:21:56.265054Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "prediction = logreg.predict(X_test)\n",
+    "cm = confusion_matrix(prediction, y_test)\n",
+    "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.493617Z",
+     "start_time": "2019-06-16T21:21:56.489929Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# f1 score value\n",
+    "f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metrics\n",
+    "metrics_file = '../reports/metrics.json'\n",
+    "\n",
+    "metrics = {\n",
+    "    'f1': f1\n",
+    "}\n",
+    "\n",
+    "with open(metrics_file, 'w') as mf:\n",
+    "    json.dump(\n",
+    "        obj=metrics,\n",
+    "        fp=mf,\n",
+    "        indent=4\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.966279Z",
+     "start_time": "2019-06-16T21:21:56.726149Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save confusion matrix image\n",
+    "confusion_matrix_image = '../reports/confusion_matrix.png'\n",
+    "cm_plot.savefig(confusion_matrix_image)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/step-2-create-config-file.ipynb b/notebooks/step-2-create-config-file.ipynb
new file mode 100644
index 00000000..1169b196
--- /dev/null
+++ b/notebooks/step-2-create-config-file.ipynb
@@ -0,0 +1,532 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.460557Z",
+     "start_time": "2019-06-16T21:17:29.395297Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import itertools\n",
+    "import joblib\n",
+    "import json\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import confusion_matrix, f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Go to project root folder\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read config\n",
+    "import pprint\n",
+    "\n",
+    "with open('params.yaml') as conf_file:\n",
+    "    config = yaml.safe_load(conf_file)\n",
+    "\n",
+    "pprint.pprint(config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.485189Z",
+     "start_time": "2019-06-16T21:17:31.473720Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get data \n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.datasets import load_iris\n",
+    "\n",
+    "data = load_iris(as_frame=True)\n",
+    "dataset = data.frame\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print labels for target values \n",
+    "\n",
+    "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:32.328046Z",
+     "start_time": "2019-06-16T21:17:32.323611Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# feature names\n",
+    "\n",
+    "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n",
+    "\n",
+    "feature_names = dataset.columns.tolist()[:4]\n",
+    "feature_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save raw data\n",
+    "dataset.to_csv(config['data']['dataset_csv'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Feature engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.150708Z",
+     "start_time": "2019-06-16T21:21:02.144518Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n",
+    "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n",
+    "\n",
+    "dataset = dataset[[\n",
+    "    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
+    "#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n",
+    "    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n",
+    "    'target'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.987144Z",
+     "start_time": "2019-06-16T21:21:02.976092Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save features\n",
+    "dataset.to_csv(config['data']['features_path'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Split dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:07.438133Z",
+     "start_time": "2019-06-16T21:21:07.431649Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset, test_dataset = train_test_split(\n",
+    "    dataset, test_size=config['data']['test_size'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "train_dataset.shape, test_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save train and test sets\n",
+    "train_dataset.to_csv(config['data']['trainset_path'])\n",
+    "test_dataset.to_csv(config['data']['testset_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:10.932148Z",
+     "start_time": "2019-06-16T21:21:10.927844Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_train = train_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.427365Z",
+     "start_time": "2019-06-16T21:21:55.416431Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create a LogisticRegression classifier and fit it on the training data\n",
+    "\n",
+    "logreg = LogisticRegression(\n",
+    "    **config['train']['clf_params'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "logreg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joblib.dump(logreg, config['train']['model_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.875303Z",
+     "start_time": "2019-06-16T21:21:55.864724Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def plot_confusion_matrix(cm,\n",
+    "                          target_names,\n",
+    "                          title='Confusion matrix',\n",
+    "                          cmap=None,\n",
+    "                          normalize=True):\n",
+    "    \"\"\"\n",
+    "    given a sklearn confusion matrix (cm), make a nice plot\n",
+    "\n",
+    "    Arguments\n",
+    "    ---------\n",
+    "    cm:           confusion matrix from sklearn.metrics.confusion_matrix\n",
+    "\n",
+    "    target_names: the class names for classification classes such as [0, 1, 2],\n",
+    "                  for example: ['high', 'medium', 'low']\n",
+    "\n",
+    "    title:        the text to display at the top of the matrix\n",
+    "\n",
+    "    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm\n",
+    "                  see http://matplotlib.org/examples/color/colormaps_reference.html\n",
+    "                  plt.get_cmap('jet') or plt.cm.Blues\n",
+    "\n",
+    "    normalize:    If False, plot the raw numbers\n",
+    "                  If True, plot the proportions\n",
+    "\n",
+    "    Usage\n",
+    "    -----\n",
+    "    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by\n",
+    "                                                              # sklearn.metrics.confusion_matrix\n",
+    "                          normalize    = True,                # show proportions\n",
+    "                          target_names = y_labels_vals,       # list of names of the classes\n",
+    "                          title        = best_estimator_name) # title of graph\n",
+    "\n",
+    "    Citation\n",
+    "    --------\n",
+    "    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n",
+    "\n",
+    "    \"\"\"\n",
+    "\n",
+    "    accuracy = np.trace(cm) / float(np.sum(cm))\n",
+    "    misclass = 1 - accuracy\n",
+    "\n",
+    "    if cmap is None:\n",
+    "        cmap = plt.get_cmap('Blues')\n",
+    "\n",
+    "    plt.figure(figsize=(8, 6))\n",
+    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
+    "    plt.title(title)\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    if target_names is not None:\n",
+    "        tick_marks = np.arange(len(target_names))\n",
+    "        plt.xticks(tick_marks, target_names, rotation=45)\n",
+    "        plt.yticks(tick_marks, target_names)\n",
+    "\n",
+    "    if normalize:\n",
+    "        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
+    "\n",
+    "    thresh = cm.max() / 1.5 if normalize else cm.max() / 2\n",
+    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
+    "        if normalize:\n",
+    "            plt.text(j, i, \"{:0.4f}\".format(cm[i, j]),\n",
+    "                     horizontalalignment=\"center\",\n",
+    "                     color=\"white\" if cm[i, j] > thresh else \"black\")\n",
+    "        else:\n",
+    "            plt.text(j, i, \"{:,}\".format(cm[i, j]),\n",
+    "                     horizontalalignment=\"center\",\n",
+    "                     color=\"white\" if cm[i, j] > thresh else \"black\")\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.ylabel('True label')\n",
+    "    plt.xlabel('Predicted label\\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))\n",
+    "    \n",
+    "    return plt.gcf()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.090756Z",
+     "start_time": "2019-06-16T21:21:56.086966Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_test = test_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.270245Z",
+     "start_time": "2019-06-16T21:21:56.265054Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "prediction = logreg.predict(X_test)\n",
+    "cm = confusion_matrix(prediction, y_test)\n",
+    "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.493617Z",
+     "start_time": "2019-06-16T21:21:56.489929Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# f1 score value\n",
+    "f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metrics\n",
+    "metrics = {\n",
+    "    'f1': f1\n",
+    "}\n",
+    "\n",
+    "with open(config['reports']['metrics_file'], 'w') as mf:\n",
+    "    json.dump(\n",
+    "        obj=metrics,\n",
+    "        fp=mf,\n",
+    "        indent=4\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.966279Z",
+     "start_time": "2019-06-16T21:21:56.726149Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save confusion matrix image\n",
+    "cm_plot.savefig(config['reports']['confusion_matrix_image'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.4 64-bit ('dvc-venv': venv)",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.4"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/step-3-reusable-code.ipynb b/notebooks/step-3-reusable-code.ipynb
new file mode 100644
index 00000000..c083cdb5
--- /dev/null
+++ b/notebooks/step-3-reusable-code.ipynb
@@ -0,0 +1,436 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.460557Z",
+     "start_time": "2019-06-16T21:17:29.395297Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import itertools\n",
+    "import joblib\n",
+    "import json\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import confusion_matrix, f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import yaml\n",
+    "\n",
+    "# import plot_confusion_matrix()\n",
+    "from src.report.visualize import plot_confusion_matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Go to project root folder\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read config\n",
+    "import pprint\n",
+    "\n",
+    "with open('params.yaml') as conf_file:\n",
+    "    config = yaml.safe_load(conf_file)\n",
+    "\n",
+    "pprint.pprint(config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.485189Z",
+     "start_time": "2019-06-16T21:17:31.473720Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get data \n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.datasets import load_iris\n",
+    "\n",
+    "data = load_iris(as_frame=True)\n",
+    "dataset = data.frame\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print labels for target values \n",
+    "\n",
+    "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:32.328046Z",
+     "start_time": "2019-06-16T21:17:32.323611Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# feature names\n",
+    "\n",
+    "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n",
+    "\n",
+    "feature_names = dataset.columns.tolist()[:4]\n",
+    "feature_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save raw data\n",
+    "dataset.to_csv(config['data']['dataset_csv'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Features engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.150708Z",
+     "start_time": "2019-06-16T21:21:02.144518Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n",
+    "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n",
+    "\n",
+    "dataset = dataset[[\n",
+    "    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
+    "#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n",
+    "    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n",
+    "    'target'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.987144Z",
+     "start_time": "2019-06-16T21:21:02.976092Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save features\n",
+    "dataset.to_csv(config['data']['features_path'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Split dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:07.438133Z",
+     "start_time": "2019-06-16T21:21:07.431649Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset, test_dataset = train_test_split(\n",
+    "    dataset, test_size=config['data']['test_size'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "train_dataset.shape, test_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save train and test sets\n",
+    "train_dataset.to_csv(config['data']['trainset_path'])\n",
+    "test_dataset.to_csv(config['data']['testset_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:10.932148Z",
+     "start_time": "2019-06-16T21:21:10.927844Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_train = train_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.427365Z",
+     "start_time": "2019-06-16T21:21:55.416431Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create a Logistic Regression classifier and fit it to the training data\n",
+    "\n",
+    "logreg = LogisticRegression(\n",
+    "    **config['train']['clf_params'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "logreg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joblib.dump(logreg, config['train']['model_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.090756Z",
+     "start_time": "2019-06-16T21:21:56.086966Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_test = test_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.270245Z",
+     "start_time": "2019-06-16T21:21:56.265054Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "prediction = logreg.predict(X_test)\n",
+    "cm = confusion_matrix(prediction, y_test)\n",
+    "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.493617Z",
+     "start_time": "2019-06-16T21:21:56.489929Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# f1 score value\n",
+    "f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metrics\n",
+    "metrics = {\n",
+    "    'f1': f1\n",
+    "}\n",
+    "\n",
+    "with open(config['reports']['metrics_file'], 'w') as mf:\n",
+    "    json.dump(\n",
+    "        obj=metrics,\n",
+    "        fp=mf,\n",
+    "        indent=4\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.966279Z",
+     "start_time": "2019-06-16T21:21:56.726149Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save confusion matrix image\n",
+    "cm_plot.savefig(config['reports']['confusion_matrix_image'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.4"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/step-4-build-ml-pipeline.ipynb b/notebooks/step-4-build-ml-pipeline.ipynb
new file mode 100644
index 00000000..5f6246f5
--- /dev/null
+++ b/notebooks/step-4-build-ml-pipeline.ipynb
@@ -0,0 +1,442 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.460557Z",
+     "start_time": "2019-06-16T21:17:29.395297Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import itertools\n",
+    "import joblib\n",
+    "import json\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import confusion_matrix, f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import yaml\n",
+    "\n",
+    "# import plot_confusion_matrix()\n",
+    "from src.report.visualize import plot_confusion_matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/jenif/course-ds-base\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Go to project root folder\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'base': {'random_state': 42},\n",
+      " 'data': {'dataset_csv': 'data/raw/iris.csv',\n",
+      "          'features_path': 'data/processed/featured_iris.csv',\n",
+      "          'test_size': 0.2,\n",
+      "          'testset_path': 'data/processed/test_iris.csv',\n",
+      "          'trainset_path': 'data/processed/train_iris.csv'},\n",
+      " 'reports': {'confusion_matrix_image': 'reports/confusion_matrix.png',\n",
+      "             'metrics_file': 'reports/metrics.json'},\n",
+      " 'train': {'clf_params': {'C': 0.001,\n",
+      "                          'max_iter': 100,\n",
+      "                          'multi_class': 'multinomial',\n",
+      "                          'solver': 'lbfgs'},\n",
+      "           'model_path': 'models/model.joblib'}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Read config\n",
+    "import pprint\n",
+    "\n",
+    "with open('params.yaml') as conf_file:\n",
+    "    config = yaml.safe_load(conf_file)\n",
+    "\n",
+    "pprint.pprint(config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data Load complete\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.stages.data_load import data_load\n",
+    "\n",
+    "data_load(config_path = \"params.yaml\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data Load complete\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python src/stages/data_load.py --config=params.yaml"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Features engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.150708Z",
+     "start_time": "2019-06-16T21:21:02.144518Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n",
+    "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n",
+    "\n",
+    "dataset = dataset[[\n",
+    "    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
+    "#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n",
+    "    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n",
+    "    'target'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.987144Z",
+     "start_time": "2019-06-16T21:21:02.976092Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save features\n",
+    "dataset.to_csv(config['data']['features_path'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Split dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:07.438133Z",
+     "start_time": "2019-06-16T21:21:07.431649Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset, test_dataset = train_test_split(\n",
+    "    dataset, test_size=config['data']['test_size'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "train_dataset.shape, test_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save train and test sets\n",
+    "train_dataset.to_csv(config['data']['trainset_path'])\n",
+    "test_dataset.to_csv(config['data']['testset_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:10.932148Z",
+     "start_time": "2019-06-16T21:21:10.927844Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_train = train_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.427365Z",
+     "start_time": "2019-06-16T21:21:55.416431Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create a Logistic Regression classifier and fit it to the training data\n",
+    "\n",
+    "logreg = LogisticRegression(\n",
+    "    **config['train']['clf_params'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "logreg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joblib.dump(logreg, config['train']['model_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.090756Z",
+     "start_time": "2019-06-16T21:21:56.086966Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_test = test_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.270245Z",
+     "start_time": "2019-06-16T21:21:56.265054Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "prediction = logreg.predict(X_test)\n",
+    "cm = confusion_matrix(prediction, y_test)\n",
+    "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.493617Z",
+     "start_time": "2019-06-16T21:21:56.489929Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# f1 score value\n",
+    "f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metrics\n",
+    "metrics = {\n",
+    "    'f1': f1\n",
+    "}\n",
+    "\n",
+    "with open(config['reports']['metrics_file'], 'w') as mf:\n",
+    "    json.dump(\n",
+    "        obj=metrics,\n",
+    "        fp=mf,\n",
+    "        indent=4\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.966279Z",
+     "start_time": "2019-06-16T21:21:56.726149Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save confusion matrix image\n",
+    "cm_plot.savefig(config['reports']['confusion_matrix_image'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.2"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/params.yaml b/params.yaml
new file mode 100644
index 00000000..a48a0538
--- /dev/null
+++ b/params.yaml
@@ -0,0 +1,22 @@
+base:
+  random_state: 42  # seed passed to train_test_split and LogisticRegression
+
+data:
+  dataset_csv: 'data/raw/iris.csv'  # raw dataset CSV written by the data load step
+  features_path: 'data/processed/featured_iris.csv'  # dataset with the added ratio features
+  test_size: 0.2  # test_size argument of train_test_split
+  trainset_path: 'data/processed/train_iris.csv'
+  testset_path: 'data/processed/test_iris.csv'
+
+
+train:
+  clf_params:  # keyword arguments unpacked into LogisticRegression(...)
+    'C': 0.001
+    'solver': 'lbfgs'
+    'multi_class': 'multinomial'
+    'max_iter': 100
+  model_path: 'models/model.joblib'  # target of joblib.dump for the fitted model
+
+reports:
+  metrics_file: 'reports/metrics.json'  # JSON file holding the macro F1 score
+  confusion_matrix_image: 'reports/confusion_matrix.png'
\ No newline at end of file
diff --git a/reports/.gitignore b/reports/.gitignore
new file mode 100644
index 00000000..b722e9e1
--- /dev/null
+++ b/reports/.gitignore
@@ -0,0 +1 @@
+!.gitignore
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d5b4910e..d04337a4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-dvc==2.6.4
+dvc==2.8.3
 joblib==1.0.1
 jupyter==1.0.0
 jupyter_contrib_nbextensions==0.5.1
diff --git a/src/report/visualize.py b/src/report/visualize.py
new file mode 100644
index 00000000..7656d4b3
--- /dev/null
+++ b/src/report/visualize.py
@@ -0,0 +1,80 @@
+import itertools
+import matplotlib.colors
+import matplotlib.pyplot as plt
+import numpy as np
+from typing import List, Text
+
+
+def plot_confusion_matrix(cm: np.array,
+                          target_names: List[Text],
+                          title: Text = 'Confusion matrix',
+                          cmap: matplotlib.colors.LinearSegmentedColormap = None,
+                          normalize: bool = True):
+    """
+    Draw a confusion matrix as an annotated heat map and return the figure.
+
+    Arguments
+    ---------
+    cm:           confusion matrix from sklearn.metrics.confusion_matrix
+
+    target_names: class names used as tick labels on both axes, for example
+                  ['high', 'medium', 'low']; None keeps the default ticks
+
+    title:        the text to display at the top of the matrix
+
+    cmap:         matplotlib colormap for the cells, e.g. plt.get_cmap('jet') or
+                  plt.cm.Blues; when None, plt.get_cmap('Blues') is used
+                  see http://matplotlib.org/examples/color/colormaps_reference.html
+
+    normalize:    If False, annotate each cell with its raw count
+                  If True, annotate each cell with its share of the row total
+
+    Usage
+    -----
+    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
+                                                              # sklearn.metrics.confusion_matrix
+                          normalize    = True,                # show proportions
+                          target_names = y_labels_vals,       # list of names of the classes
+                          title        = best_estimator_name) # title of graph
+
+    Citation
+    --------
+    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
+
+    """
+
+    accuracy = np.trace(cm) / float(np.sum(cm))
+    misclass = 1 - accuracy
+
+    if cmap is None:
+        cmap = plt.get_cmap('Blues')
+
+    plt.figure(figsize=(8, 6))
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+    plt.title(title)
+    plt.colorbar()
+
+    if target_names is not None:
+        tick_marks = np.arange(len(target_names))
+        plt.xticks(tick_marks, target_names, rotation=45)
+        plt.yticks(tick_marks, target_names)
+
+    if normalize:
+        # A row without samples would divide by zero; keep such rows at 0.
+        row_sums = cm.sum(axis=1)[:, np.newaxis]
+        cm = np.divide(cm.astype('float'), row_sums,
+                       out=np.zeros(cm.shape), where=row_sums != 0)
+
+    cell_format = "{:0.4f}" if normalize else "{:,}"
+    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, cell_format.format(cm[i, j]),
+                 horizontalalignment="center",
+                 color="white" if cm[i, j] > thresh else "black")
+
+    plt.ylabel('True label')
+    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
+    # Run the layout pass last so the axis labels are taken into account.
+    plt.tight_layout()
+
+    return plt.gcf()
diff --git a/src/stages/data_load.py b/src/stages/data_load.py
new file mode 100644
index 00000000..36e8efb1
--- /dev/null
+++ b/src/stages/data_load.py
@@ -0,0 +1,27 @@
+import argparse
+import pandas as pd
+from sklearn.datasets import load_iris
+from typing import Text
+import yaml
+
+
+def data_load(config_path: Text) -> None:
+    """Load the iris dataset and save it as CSV at the path set in the config.
+
+    config_path: YAML file; its data.dataset_csv entry is the output CSV path.
+    """
+    with open(config_path) as conf_file:
+        config = yaml.safe_load(conf_file)
+    dataset = load_iris(as_frame=True).frame
+    # Remove the ' (cm)' unit as a substring: str.strip() takes a character set, not a suffix.
+    dataset.columns = [colname.replace(' (cm)', '').replace(' ', '_') for colname in dataset.columns.tolist()]
+    dataset.to_csv(config['data']['dataset_csv'], index=False)
+    print("Data Load complete")
+
+if __name__ == '__main__':
+    # Command-line entry point, e.g.:
+    #   python src/stages/data_load.py --config=params.yaml
+    cli = argparse.ArgumentParser()
+    # --config is mandatory: path to the YAML parameters file.
+    cli.add_argument('--config', dest='config', required=True)
+    data_load(config_path=cli.parse_args().config)
\ No newline at end of file