Skip to content

Commit 23c5d90

Browse files
authored
Merge pull request #45 from akleeman/cross_validation_refactor
Cross Validation Refactor
2 parents 4401eea + 86e9468 commit 23c5d90

15 files changed

+595
-362
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ set(albatross_HEADERS
4141
albatross/map_utils.h
4242
albatross/csv_utils.h
4343
albatross/core/keys.h
44+
albatross/core/dataset.h
4445
albatross/core/model.h
4546
albatross/core/model_adapter.h
4647
albatross/core/traits.h

albatross/core/dataset.h

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* Copyright (C) 2018 Swift Navigation Inc.
3+
* Contact: Swift Navigation <[email protected]>
4+
*
5+
* This source is subject to the license found in the file 'LICENSE' which must
6+
* be distributed together with this source. All other rights reserved.
7+
*
8+
* THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
9+
* EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
10+
* WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
11+
*/
12+
13+
#ifndef ALBATROSS_CORE_DATASET_H
14+
#define ALBATROSS_CORE_DATASET_H
15+
16+
#include "core/distribution.h"
17+
#include "core/traits.h"
18+
#include <Eigen/Core>
19+
#include <cereal/archives/json.hpp>
20+
#include <map>
21+
#include <vector>
22+
23+
namespace albatross {
24+
25+
// A JointDistribution has a dense covariance matrix, which
26+
// contains the covariance between each variable and all others.
27+
using JointDistribution = Distribution<Eigen::MatrixXd>;
28+
29+
// We use a wrapper around DiagonalMatrix in order to make
30+
// the resulting distribution serializable
31+
using DiagonalMatrixXd =
32+
Eigen::SerializableDiagonalMatrix<double, Eigen::Dynamic>;
33+
// A MarginalDistribution has only a digaonal covariance
34+
// matrix, so in turn only describes the variance of each
35+
// variable independent of all others.
36+
using MarginalDistribution = Distribution<DiagonalMatrixXd>;
37+
38+
/*
39+
* A RegressionDataset holds two vectors of data, the features
40+
* where a single feature can be any class that contains the information used
41+
* to make predictions of the target. This is called a RegressionDataset since
42+
* it is assumed that each feature is regressed to a single double typed
43+
* target.
44+
*/
45+
template <typename FeatureType> struct RegressionDataset {
46+
std::vector<FeatureType> features;
47+
MarginalDistribution targets;
48+
std::map<std::string, std::string> metadata;
49+
50+
RegressionDataset(){};
51+
52+
RegressionDataset(const std::vector<FeatureType> &features_,
53+
const MarginalDistribution &targets_)
54+
: features(features_), targets(targets_) {
55+
// If the two inputs aren't the same size they clearly aren't
56+
// consistent.
57+
assert(static_cast<int>(features.size()) ==
58+
static_cast<int>(targets.size()));
59+
}
60+
61+
RegressionDataset(const std::vector<FeatureType> &features_,
62+
const Eigen::VectorXd &targets_)
63+
: RegressionDataset(features_, MarginalDistribution(targets_)) {}
64+
65+
bool operator==(const RegressionDataset &other) const {
66+
return (features == other.features && targets == other.targets &&
67+
metadata == other.metadata);
68+
}
69+
70+
template <class Archive>
71+
typename std::enable_if<valid_in_out_serializer<FeatureType, Archive>::value,
72+
void>::type
73+
serialize(Archive &archive) {
74+
archive(cereal::make_nvp("features", features));
75+
archive(cereal::make_nvp("targets", targets));
76+
archive(cereal::make_nvp("metadata", metadata));
77+
}
78+
79+
template <class Archive>
80+
typename std::enable_if<!valid_in_out_serializer<FeatureType, Archive>::value,
81+
void>::type
82+
serialize(Archive &archive) {
83+
static_assert(delay_static_assert<Archive>::value,
84+
"In order to serialize a RegressionDataset the corresponding "
85+
"FeatureType must be serializable.");
86+
}
87+
};
88+
89+
} // namespace albatross
90+
91+
#endif

albatross/core/distribution.h

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include "cereal/cereal.hpp"
1717
#include "core/traits.h"
1818
#include "eigen/serializable_diagonal_matrix.h"
19-
#include "indexing.h"
2019
#include <Eigen/Core>
2120
#include <iostream>
2221
#include <map>
@@ -109,18 +108,6 @@ using DiagonalMatrixXd =
109108
// variable independent of all others.
110109
using MarginalDistribution = Distribution<DiagonalMatrixXd>;
111110

112-
template <typename CovarianceType, typename SizeType>
113-
Distribution<CovarianceType> subset(const std::vector<SizeType> &indices,
114-
const Distribution<CovarianceType> &dist) {
115-
auto mean = subset(indices, Eigen::VectorXd(dist.mean));
116-
if (dist.has_covariance()) {
117-
auto cov = symmetric_subset(indices, dist.covariance);
118-
return Distribution<CovarianceType>(mean, cov);
119-
} else {
120-
return Distribution<CovarianceType>(mean);
121-
}
122-
}
123-
124111
} // namespace albatross
125112

126113
#endif

albatross/core/indexing.h

Lines changed: 174 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,23 @@
1313
#ifndef ALBATROSS_CORE_INDEXING_H
1414
#define ALBATROSS_CORE_INDEXING_H
1515

#include "core/dataset.h"
#include <Eigen/Core>
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>
1922

2023
namespace albatross {
2124

25+
// 32-bit signed index type used throughout the indexing helpers.
using s32 = int32_t;
// The indices of the held-out (test) observations for a single fold.
using FoldIndices = std::vector<s32>;
// Human-readable identifier for a fold.
using FoldName = std::string;
// Maps each fold's name to the test indices that define it.
using FoldIndexer = std::map<FoldName, FoldIndices>;
29+
2230
/*
2331
* Extract a subset of a standard vector.
2432
*/
25-
2633
template <typename SizeType, typename X>
2734
inline std::vector<X> subset(const std::vector<SizeType> &indices,
2835
const std::vector<X> &v) {
@@ -47,6 +54,21 @@ inline Eigen::VectorXd subset(const std::vector<SizeType> &indices,
4754
return out;
4855
}
4956

57+
/*
58+
* Extracts a subset of columns from an Eigen::Matrix
59+
*/
60+
template <typename SizeType>
61+
inline Eigen::MatrixXd subset_cols(const std::vector<SizeType> &col_indices,
62+
const Eigen::MatrixXd &v) {
63+
Eigen::MatrixXd out(v.rows(), col_indices.size());
64+
for (std::size_t i = 0; i < col_indices.size(); i++) {
65+
auto ii = static_cast<Eigen::Index>(i);
66+
auto col_index = static_cast<Eigen::Index>(col_indices[i]);
67+
out.col(ii) = v.col(col_index);
68+
}
69+
return out;
70+
}
71+
5072
/*
5173
* Extracts a subset of an Eigen::Matrix for the given row and column
5274
* indices.
@@ -90,6 +112,157 @@ symmetric_subset(const std::vector<SizeType> &indices,
90112
return subset(indices, v.diagonal()).asDiagonal();
91113
}
92114

115+
template <typename CovarianceType, typename SizeType>
116+
Distribution<CovarianceType> subset(const std::vector<SizeType> &indices,
117+
const Distribution<CovarianceType> &dist) {
118+
auto mean = subset(indices, Eigen::VectorXd(dist.mean));
119+
if (dist.has_covariance()) {
120+
auto cov = symmetric_subset(indices, dist.covariance);
121+
return Distribution<CovarianceType>(mean, cov);
122+
} else {
123+
return Distribution<CovarianceType>(mean);
124+
}
125+
}
126+
127+
/*
128+
* A combination of training and testing datasets, typically used in cross
129+
* validation.
130+
*/
131+
template <typename FeatureType> struct RegressionFold {
132+
RegressionDataset<FeatureType> train_dataset;
133+
RegressionDataset<FeatureType> test_dataset;
134+
FoldName name;
135+
FoldIndices test_indices;
136+
137+
RegressionFold(const RegressionDataset<FeatureType> &train_dataset_,
138+
const RegressionDataset<FeatureType> &test_dataset_,
139+
const FoldName &name_, const FoldIndices &test_indices_)
140+
: train_dataset(train_dataset_), test_dataset(test_dataset_), name(name_),
141+
test_indices(test_indices_){};
142+
};
143+
144+
/*
 * Returns all indices in [0, n) that do not appear in test_indices,
 * in ascending order. These form the training set for a fold.
 */
inline FoldIndices get_train_indices(const FoldIndices &test_indices,
                                     const int n) {
  // Mark the test indices up front so membership checks are O(1),
  // replacing the previous O(n * k) linear scan per candidate. This
  // also tolerates duplicate test indices, which previously could
  // overrun the preallocated output buffer.
  std::vector<char> is_test(static_cast<std::size_t>(n), 0);
  for (const auto &idx : test_indices) {
    if (idx >= 0 && idx < static_cast<s32>(n)) {
      is_test[static_cast<std::size_t>(idx)] = 1;
    }
  }
  // The train indices are all the indices that are not test indices.
  FoldIndices train_indices;
  train_indices.reserve(static_cast<std::size_t>(n));
  for (s32 j = 0; j < n; j++) {
    if (!is_test[static_cast<std::size_t>(j)]) {
      train_indices.push_back(j);
    }
  }
  return train_indices;
}
159+
160+
/*
161+
* Each flavor of cross validation can be described by a set of
162+
* FoldIndices, which store which indices should be used for the
163+
* test cases. This function takes a map from FoldName to
164+
* FoldIndices and a dataset and creates the resulting folds.
165+
*/
166+
template <typename FeatureType>
167+
static inline std::vector<RegressionFold<FeatureType>>
168+
folds_from_fold_indexer(const RegressionDataset<FeatureType> &dataset,
169+
const FoldIndexer &groups) {
170+
// For a dataset with n features, we'll have n folds.
171+
const s32 n = static_cast<s32>(dataset.features.size());
172+
std::vector<RegressionFold<FeatureType>> folds;
173+
// For each fold, partition into train and test sets.
174+
for (const auto &pair : groups) {
175+
// These get exposed inside the returned RegressionFold and because
176+
// we'd like to prevent modification of the output from this function
177+
// from changing the input FoldIndexer we perform a copy here.
178+
const FoldName group_name(pair.first);
179+
const FoldIndices test_indices(pair.second);
180+
const auto train_indices = get_train_indices(test_indices, n);
181+
182+
std::vector<FeatureType> train_features =
183+
subset(train_indices, dataset.features);
184+
MarginalDistribution train_targets = subset(train_indices, dataset.targets);
185+
186+
std::vector<FeatureType> test_features =
187+
subset(test_indices, dataset.features);
188+
MarginalDistribution test_targets = subset(test_indices, dataset.targets);
189+
190+
assert(train_features.size() == train_targets.size());
191+
assert(test_features.size() == test_targets.size());
192+
assert(test_targets.size() + train_targets.size() == n);
193+
194+
const RegressionDataset<FeatureType> train_split(train_features,
195+
train_targets);
196+
const RegressionDataset<FeatureType> test_split(test_features,
197+
test_targets);
198+
folds.push_back(RegressionFold<FeatureType>(train_split, test_split,
199+
group_name, test_indices));
200+
}
201+
return folds;
202+
}
203+
204+
template <typename FeatureType>
205+
static inline FoldIndexer
206+
leave_one_out_indexer(const RegressionDataset<FeatureType> &dataset) {
207+
FoldIndexer groups;
208+
for (s32 i = 0; i < static_cast<s32>(dataset.features.size()); i++) {
209+
FoldName group_name = std::to_string(i);
210+
groups[group_name] = {i};
211+
}
212+
return groups;
213+
}
214+
215+
/*
216+
* Splits a dataset into cross validation folds where each fold contains all but
217+
* one predictor/target pair.
218+
*/
219+
template <typename FeatureType>
220+
static inline FoldIndexer leave_one_group_out_indexer(
221+
const RegressionDataset<FeatureType> &dataset,
222+
const std::function<FoldName(const FeatureType &)> &get_group_name) {
223+
FoldIndexer groups;
224+
for (s32 i = 0; i < static_cast<s32>(dataset.features.size()); i++) {
225+
const std::string k =
226+
get_group_name(dataset.features[static_cast<std::size_t>(i)]);
227+
// Get the existing indices if we've already encountered this group_name
228+
// otherwise initialize a new one.
229+
FoldIndices indices;
230+
if (groups.find(k) == groups.end()) {
231+
indices = FoldIndices();
232+
} else {
233+
indices = groups[k];
234+
}
235+
// Add the current index.
236+
indices.push_back(i);
237+
groups[k] = indices;
238+
}
239+
return groups;
240+
}
241+
242+
/*
243+
* Generates cross validation folds which represent leave one out
244+
* cross validation.
245+
*/
246+
template <typename FeatureType>
247+
static inline std::vector<RegressionFold<FeatureType>>
248+
leave_one_out(const RegressionDataset<FeatureType> &dataset) {
249+
return folds_from_fold_indexer<FeatureType>(
250+
dataset, leave_one_out_indexer<FeatureType>(dataset));
251+
}
252+
253+
/*
254+
* Uses a `get_group_name` function to bucket each FeatureType into
255+
* a group, then holds out one group at a time.
256+
*/
257+
template <typename FeatureType>
258+
static inline std::vector<RegressionFold<FeatureType>> leave_one_group_out(
259+
const RegressionDataset<FeatureType> &dataset,
260+
const std::function<FoldName(const FeatureType &)> &get_group_name) {
261+
const FoldIndexer indexer =
262+
leave_one_group_out_indexer<FeatureType>(dataset, get_group_name);
263+
return folds_from_fold_indexer<FeatureType>(dataset, indexer);
264+
}
265+
93266
} // namespace albatross
94267

95268
#endif

0 commit comments

Comments
 (0)