From a011d2147a35c2a1992101b4fc4fcb73e522ad1d Mon Sep 17 00:00:00 2001
From: David Alonso
Date: Sat, 22 Aug 2020 16:11:32 +0100
Subject: [PATCH 01/35] optimizer

---
 the_atlantics/example.py | 74 +++++++++
 the_atlantics/example_minimizer.py | 74 +++++++++
 the_atlantics/snc.py | 259 +++++++++++++++++++++++++++++
 3 files changed, 407 insertions(+)
 create mode 100644 the_atlantics/example.py
 create mode 100644 the_atlantics/example_minimizer.py
 create mode 100644 the_atlantics/snc.py

diff --git a/the_atlantics/example.py b/the_atlantics/example.py
new file mode 100644
index 00000000..76a8a8fd
--- /dev/null
+++ b/the_atlantics/example.py
@@ -0,0 +1,74 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from snc import SnCalc
+
+
+# Let's start with a "large" set of 12 samples
+n_bins = 12
+z_cent = np.linspace(0.2, 1.4, n_bins+1)+0.5/n_bins
+sigma_z = 0.05 * (1+z_cent)
+z_arr = np.linspace(0, 2, 1024)
+# Total number of galaxies in each sample
+n_gals = (z_cent/0.5)**2*np.exp(-(z_cent/0.5)**1.5) * 10. * 1.8E4 * 60**2
+
+
+def pz_gau(z, zc, sz):
+    return np.exp(-0.5*((z-zc)/sz)**2)/np.sqrt(2*np.pi*sz**2)
+
+
+# Create the N(z) arrays, which should integrate to the total number
+# of galaxies in each sample
+nzs = [ng * pz_gau(z_arr, zc, sz)
+       for ng, zc, sz in zip(n_gals, z_cent, sigma_z)]
+
+# Let's plot them
+plt.figure()
+for nz in nzs:
+    plt.plot(z_arr, nz)
+plt.xlabel(r'$z$', fontsize=16)
+plt.ylabel(r'$N(z)$', fontsize=16)
+
+
+c = SnCalc(z_arr, nzs)
+# Now let's do a dumb thing and bunch them up into
+# 4, 3, 2 and 1 subsamples
+assign4 = [np.array([0, 1, 2]),
+           np.array([3, 4, 5]),
+           np.array([6, 7, 8]),
+           np.array([9, 10, 11])]
+assign3 = [np.array([0, 1, 2, 3]),
+           np.array([4, 5, 6, 7]),
+           np.array([8, 9, 10, 11])]
+assign2 = [np.array([0, 1, 2, 3, 4, 5]),
+           np.array([6, 7, 8, 9, 10, 11])]
+assign1 = [np.arange(n_bins)]
+
+
+# OK, calculate S/N for all of these groupings
+snr_d4 = c.get_sn(assign4, full_output=True)
+print('4 bins, S/N = %.1lf' % (snr_d4['snr']))
+snr_d3 = c.get_sn(assign3, full_output=True)
+print('3 bins, S/N = %.1lf' % (snr_d3['snr']))
+snr_d2 = c.get_sn(assign2, full_output=True)
+print('2 bins, S/N = %.1lf' % (snr_d2['snr']))
+snr_d1 = c.get_sn(assign1, full_output=True)
+print('1 bin, S/N = %.1lf' % (snr_d1['snr']))
+
+
+# Note that the calculator also returns the power
+# spectra, if you want to look at them.
+plt.figure()
+for i in range(4):
+    for j in range(i, 4):
+        if i == j:
+            plt.plot(snr_d4['ls'],
+                     snr_d4['cl'][:, i, j],
+                     'k-')
+            plt.plot(snr_d4['ls'],
+                     snr_d4['nl'][0, i, j]*np.ones_like(snr_d4['ls']), 'k--')
+        else:
+            plt.plot(snr_d4['ls'],
+                     snr_d4['cl'][:, i, j],
+                     '-', c='#AAAAAA')
+plt.loglog()
+plt.show()
diff --git a/the_atlantics/example_minimizer.py b/the_atlantics/example_minimizer.py
new file mode 100644
index 00000000..697980be
--- /dev/null
+++ b/the_atlantics/example_minimizer.py
@@ -0,0 +1,74 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from snc import SnCalc
+from scipy.optimize import minimize
+
+
+# Let's start with a "large" set of 12 samples
+n_bins = 12
+z_cent = np.linspace(0.2, 1.4, n_bins+1)+0.5/n_bins
+sigma_z = 0.05 * (1+z_cent)
+z_arr = np.linspace(0, 2, 1024)
+# Total number of galaxies in each sample
+n_gals = (z_cent/0.5)**2*np.exp(-(z_cent/0.5)**1.5) * 10. * 1.8E4 * 60**2
+
+
+def pz_gau(z, zc, sz):
+    return np.exp(-0.5*((z-zc)/sz)**2)/np.sqrt(2*np.pi*sz**2)
+
+
+# Create the N(z) arrays, which should integrate to the total number
+# of galaxies in each sample.
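+# (A quick sanity check, as a sketch: scipy.integrate.simps(nzs[i], x=z_arr)
+# should come out close to n_gals[i], up to the Gaussian tails truncated at
+# the edges of z_arr.)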
+nzs = [ng * pz_gau(z_arr, zc, sz) + for ng, zc, sz in zip(n_gals, z_cent, sigma_z)] + +# Let's plot them +plt.figure() +for nz in nzs: + plt.plot(z_arr, nz) +plt.xlabel(r'$z$', fontsize=16) +plt.ylabel(r'$N(z)$', fontsize=16) + +# Now initialize a S/N calculator for these initial groups. +c = SnCalc(z_arr, nzs) + +# OK, let's brute-force explore the total S/N as a function +# of the bin edge for a 2-bin scenario. +zs = np.linspace(0.01, 1.5, 128) +sns = np.array([c.get_sn_from_edges(np.array([z])) for z in zs]) +plt.figure() +plt.plot(zs, sns) +plt.xlabel('$z$') +plt.ylabel('$S/N$') + +# Now let's do a 3-bin case (more coarsely-grained so we can do it quickly) +zs = np.linspace(0.01, 1.5, 32) +sns = np.array([[c.get_sn_from_edges(np.array([z1, z2])) for z1 in zs] + for z2 in zs]) + +plt.figure() +plt.imshow(sns, vmin=1440, extent=[0.01, 1.5, 0.01, 1.5], + origin='lower') +plt.xlabel('$z_1$') +plt.ylabel('$z_2$') +cb = plt.colorbar() +cb.set_label('$S/N$') + + +# Finally, let's write the function to minimize and optimize for a 4-bin case. +def minus_sn(edges, calc): + return -calc.get_sn_from_edges(edges) + + +edges_0 = np.array([0.3, 0.6, 0.9]) +res = minimize(minus_sn, edges_0, method='Powell', args=(c,)) +print("WL final edges: ", res.x) +print("Maximum S/N: ", c.get_sn_from_edges(res.x)) + + +# That was for weak lensing. Let's do the same thing for clustering. +c = SnCalc(z_arr, nzs, use_clustering=True) +res = minimize(minus_sn, edges_0, method='Powell', args=(c,)) +print("GC final edges: ", res.x) +print("Maximum S/N: ", c.get_sn_from_edges(res.x)) +plt.show() diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py new file mode 100644 index 00000000..d62379d9 --- /dev/null +++ b/the_atlantics/snc.py @@ -0,0 +1,259 @@ +import numpy as np +import pyccl as ccl +from scipy.integrate import simps +import os + + +assign_params_default = {'p_inbin_thr': 0.5, + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} + + +class SnCalc(object): + edges_large = 100. + + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, + s_gamma=0.28, use_clustering=False): + """ S/N calculator + + Args: + z_arr (array_like): array of redshifts at which all the N(z)s are + sampled. They should be linearly spaced. + nz_list (list): list of arrays containing the redshift + distributions of all the initial groups (each array in the + list corresponds to one group). The arrays should contain the + redshift distribution sampled at `z_arr`, with its integral + over `z_arr` equal to the number of galaxies in that group. + fsky (float): sky fraction (for SNR calculation and for + transforming number of galaxies into number densities). + lmax (float): maximum ell. + d_ell (float): ell bandpower width. + s_gamma (float): rms ellipticity scatter (relevant if + `use_clustering=False`). + use_clustering (bool): if `True`, SNR will be computed for + clustering instead of lensing. 
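+
+        Example (a minimal sketch with made-up Gaussian N(z)s; the
+        amplitudes below correspond to roughly 2.5e5 galaxies per group):
+
+            >>> z = np.linspace(0, 2, 512)
+            >>> nz1 = 1e6 * np.exp(-0.5 * ((z - 0.5) / 0.1)**2)
+            >>> nz2 = 1e6 * np.exp(-0.5 * ((z - 1.0) / 0.1)**2)
+            >>> calc = SnCalc(z, [nz1, nz2])
+            >>> snr = calc.get_sn([np.array([0]), np.array([1])])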
+ """ + self.s_gamma = s_gamma + self.fsky = fsky + self.lmax = lmax + self.d_ell = d_ell + self.larr = np.arange(2, lmax, d_ell)+d_ell/2 + self.n_ell = len(self.larr) + self.n_samples = len(nz_list) + self.z_arr = z_arr + self.nz_list = nz_list + self.n_gals = np.array([simps(nz, x=self.z_arr) + for nz in self.nz_list]) + self.pz_list = self.nz_list / self.n_gals[:, None] + self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) + for nz in self.nz_list]) / self.n_gals + self.n_dens = self.n_gals / (4*np.pi*self.fsky) + self.cls = None + self.use_clustering = use_clustering + + def _bz_model(self, cosmo, z): + return 0.95/ccl.growth_factor(cosmo, 1./(1+z)) + + def get_cl_matrix(self, fname_save=None, recompute=False): + """ Computes matrix of power spectra between all the initial groups. + + Args: + fname_save (string): if not None, the result will be saved to a + file by this name, and if present and `recompute=False`, the + matrix will be read from that file. + recompute (bool): if `True`, any previous calculations of the + C_ell matrix stored on file will be ignored, and the matrix + will be recomputed. + """ + if fname_save is not None: + if os.path.isfile(fname_save) and not recompute: + self.cls = np.load(fname_save)['cls'] + return + + # Cosmology + cosmo = ccl.CosmologyVanillaLCDM() + + # Tracers + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + + # Cls + self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) + for i in range(self.n_samples): + for j in range(i, self.n_samples): + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr) + self.cls[:, i, j] = cl + if j != i: + self.cls[:, j, i] = cl + + if fname_save is not None: + np.savez(fname_save, cls=self.cls) + + def get_resample_metadata(self, assign): + """ Transform an assignment list into a weight matrix and the + corresponding number densities. + + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + """ + n_resamples = len(assign) + weight_res = np.zeros([n_resamples, self.n_samples]) + n_dens_res = np.zeros(n_resamples) + for ia, a in enumerate(assign): + ndens = self.n_dens[a] + ndens_tot = np.sum(ndens) + n_dens_res[ia] = ndens_tot + weight_res[ia, a] = ndens / ndens_tot + return weight_res, n_dens_res + + def _get_cl_resamples(self, weights): + """ Gets C_ell matrix for resampled groups from weights matrix. + """ + if self.cls is None: + self.get_cl_matrix() + return np.einsum('jl,km,ilm', weights, weights, self.cls) + + def _get_nl_resamples(self, n_dens): + """ Gets noise contribution to C_ell matrix for resampled groups + from list of number densities. + """ + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_wn(self, weights, n_dens, full_output=False): + """ Compute signal-to-noise ratio from weights matrix and number + densities of the resampled groups. + + Args: + weights (array_like): weights matrix of shape + `(N_resample, N_initial)`, where `N_resample` is the + number of new groups, and `N_initial` is the original + number of groups. Each entry corresponds to the weight + which with a given initial group enters the new set of + groups. 
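+                For example, merging initial groups 0 and 1, with number
+                densities n0 and n1, into a single new group corresponds
+                to the row [n0/(n0+n1), n1/(n0+n1), 0, ...], which is what
+                `get_resample_metadata` builds.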
+ n_dens (array_like): number density (in sterad^-1) of the + new groups. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + sl = self._get_cl_resamples(weights) + nl = self._get_nl_resamples(n_dens) + cl = sl + nl + icl = np.linalg.inv(cl) + sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) + sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], + axis=2) + trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) + snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * + self.d_ell / self.fsky)) + + if full_output: + dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} + return dret + else: + return snr + + def get_sn(self, assign, full_output=False): + """ Compute signal-to-noise ratio from a bin assignment list. + + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + w, ndens = self.get_resample_metadata(assign) + return self.get_sn_wn(w, ndens, + full_output=full_output) + + def check_edges(self, edges): + """ Returns `True` if there's something wrong with the edges. + """ + return np.any(edges < 0) or \ + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) + + def assign_from_edges(self, edges, assign_params=assign_params_default): + """ Get assignment list from edges and assign parameters. + + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + + Returns: + List of assignment arrays ready to be used in e.g. `get_sn`. + """ + nbins = len(edges) + 1 + # Bin IDs based on mean z + ids = np.digitize(self.z_means, bins=edges) + if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + # Calculate probabilities in each bin + zgroups = np.digitize(self.z_arr, bins=edges) + masks = [zgroups == i for i in range(nbins)] + if assign_params['use_p_outbin']: + # Matrix of probabilities in each bin + pzs = np.array([[simps(pz[m], x=self.z_arr[m]) + for m in masks] + for pz in self.pz_list]) + pzd = np.array([pzs[j, ids[j]] + for j in range(self.n_samples)]) + else: + # Overlaps in own bin + pzd = np.array([simps(pz[masks[i]], + x=self.z_arr[masks[i]]) + for pz, i in zip(self.pz_list, ids)]) + + # Throw away based on in-bin probability + if assign_params['use_p_inbin']: + ids[pzd < assign_params['p_inbin_thr']] = -1 + # Throw away based on off-bin probability + if assign_params['use_p_outbin']: + ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 + for p in pzs])] = -1 + # Assignment list + return [np.where(ids == i)[0] for i in np.unique(ids)] + + def get_sn_from_edges(self, edges, full_output=False, + assign_params=assign_params_default): + """ Compute signal-to-noise ratio from a set of edges and assignment + parameters. + + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + full_output (bool): if true, a dictionary with additional + information will be returned. 
Otherwise just total S/N. + + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + if self.check_edges(edges): + return 0 + assign = self.assign_from_edges(edges, assign_params=assign_params) + return self.get_sn(assign, full_output=full_output) From a0317532b78f650d3909bbdd4f63433f2d85c1a8 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Sat, 22 Aug 2020 16:36:01 +0100 Subject: [PATCH 02/35] tested for 100 samples --- the_atlantics/example_minimizer.py | 38 ++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/the_atlantics/example_minimizer.py b/the_atlantics/example_minimizer.py index 697980be..8e168c7c 100644 --- a/the_atlantics/example_minimizer.py +++ b/the_atlantics/example_minimizer.py @@ -5,7 +5,7 @@ # Let's start with a "large" set of 12 samples -n_bins = 12 +n_bins = 100 z_cent = np.linspace(0.2, 1.4, n_bins+1)+0.5/n_bins sigma_z = 0.05 * (1+z_cent) z_arr = np.linspace(0, 2, 1024) @@ -30,20 +30,31 @@ def pz_gau(z, zc, sz): plt.ylabel(r'$N(z)$', fontsize=16) # Now initialize a S/N calculator for these initial groups. -c = SnCalc(z_arr, nzs) - +c_wl = SnCalc(z_arr, nzs, use_clustering=False) +c_gc = SnCalc(z_arr, nzs, use_clustering=True) +print("Initializing WL") +c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bins) +print("Initializing GC") +c_gc.get_cl_matrix(fname_save='gc_nb%d.npz' % n_bins) + +# Let's focus on WL for now. # OK, let's brute-force explore the total S/N as a function # of the bin edge for a 2-bin scenario. +print("Exploring 2-bins") zs = np.linspace(0.01, 1.5, 128) -sns = np.array([c.get_sn_from_edges(np.array([z])) for z in zs]) +sns_wl = np.array([c_wl.get_sn_from_edges(np.array([z])) for z in zs]) +sns_gc = np.array([c_gc.get_sn_from_edges(np.array([z])) for z in zs]) plt.figure() -plt.plot(zs, sns) +plt.plot(zs, sns_wl/np.amax(sns_wl), 'r-', label='Lensing') +plt.plot(zs, sns_gc/np.amax(sns_gc), 'k-', label='Clustering') plt.xlabel('$z$') -plt.ylabel('$S/N$') +plt.ylabel(r'$(S/N)/(S/N)_{\rm max}$') +plt.legend(loc='upper left', frameon=False) # Now let's do a 3-bin case (more coarsely-grained so we can do it quickly) +print("Exploring 3-bins") zs = np.linspace(0.01, 1.5, 32) -sns = np.array([[c.get_sn_from_edges(np.array([z1, z2])) for z1 in zs] +sns = np.array([[c_wl.get_sn_from_edges(np.array([z1, z2])) for z1 in zs] for z2 in zs]) plt.figure() @@ -60,15 +71,16 @@ def minus_sn(edges, calc): return -calc.get_sn_from_edges(edges) +print("Optimizing WL") edges_0 = np.array([0.3, 0.6, 0.9]) -res = minimize(minus_sn, edges_0, method='Powell', args=(c,)) +res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) print("WL final edges: ", res.x) -print("Maximum S/N: ", c.get_sn_from_edges(res.x)) - +print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)) +print(" ") # That was for weak lensing. Let's do the same thing for clustering. 
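+# A note on the optimizer choice: bin assignment inside get_sn_from_edges
+# uses np.digitize on the fixed group mean redshifts, so the objective is
+# piecewise constant in the edges. A derivative-free method like Powell can
+# still make headway here, whereas a gradient-based minimizer would see a
+# zero gradient almost everywhere.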
-c = SnCalc(z_arr, nzs, use_clustering=True) -res = minimize(minus_sn, edges_0, method='Powell', args=(c,)) +print("Optimizing GC") +res = minimize(minus_sn, edges_0, method='Powell', args=(c_gc,)) print("GC final edges: ", res.x) -print("Maximum S/N: ", c.get_sn_from_edges(res.x)) +print("Maximum S/N: ", c_gc.get_sn_from_edges(res.x)) plt.show() From 95eb643208818e8be45bcd2d7b441f82fa5e25f9 Mon Sep 17 00:00:00 2001 From: Bela Abolfathi Date: Fri, 28 Aug 2020 12:16:19 -0700 Subject: [PATCH 03/35] Add simple and complex classifiers --- tomo_challenge/classifiers/ComplexSOM.py | 352 +++++++++++++++++++++++ tomo_challenge/classifiers/SimpleSOM.py | 323 +++++++++++++++++++++ 2 files changed, 675 insertions(+) create mode 100644 tomo_challenge/classifiers/ComplexSOM.py create mode 100644 tomo_challenge/classifiers/SimpleSOM.py diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py new file mode 100644 index 00000000..501617e8 --- /dev/null +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -0,0 +1,352 @@ +""" +This is an example tomographic bin generator using a random forest. + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import rpy2 +import rpy2.robjects as ro +import rpy2.robjects.packages as rpack +from rpy2.robjects.vectors import StrVector, IntVector, DataFrame, FloatVector, FactorVector +import pandas as pd +from rpy2.robjects import pandas2ri +from rpy2.robjects.conversion import localconverter +from ..snc import SnCalc +from scipy.optimize import minimize +from scipy.integrate import simps + +#Check that all the needed packages are installed +# R package nameo +packnames = ('data.table','itertools','foreach','doParallel','RColorBrewer','devtools','matrixStats') +base=ro.packages.importr("base") +utils=ro.packages.importr("utils") +stats=ro.packages.importr("stats") +gr=ro.packages.importr("graphics") +dev=ro.packages.importr("grDevices") +utils.chooseCRANmirror(ind=1) +# Selectively install what needs to be installed. +names_to_install = [x for x in packnames if not rpack.isinstalled(x)] +if len(names_to_install) > 0: + utils.install_packages(StrVector(names_to_install)) +base.Sys_setenv(TAR=base.system("which tar",intern=True)) +devtools=ro.packages.importr("devtools") +devtools.install_github("AngusWright/helpRfuncs") +devtools.install_github("AngusWright/kohonen/kohonen") +kohonen=ro.packages.importr("kohonen") + +class ComplexSOM(Tomographer): + """ Simplistic SOM Classifier """ + + # valid parameter -- see below + valid_options = ['bins','som_dim','num_groups','num_threads', + 'group_type','data_threshold','sparse_frac','plots','plot_dir','metric'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = False + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+            'som_dim' - dimensions of the SOM
+            'num_groups' - number of SOM groups/associations
+            'num_threads' - number of threads to use
+            'sparse_frac' - the sparse-sampling fraction used for training
+            'group_type' - do we want grouping by colour or redshift
+            'data_threshold' - lower and upper limits applied to the data
+                values during training
+            'plots' - whether to produce diagnostic plots
+            'plot_dir' - output directory for the diagnostic plots
+            'metric' - metric to optimise ('SNR_ww' uses lensing,
+                anything else uses clustering)
+
+        """
+        self.bands = bands
+        self.opt = options
+
+    def train (self, training_data, training_z):
+        """Trains the SOM and outputs the resulting bins
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxies x Nbands
+            training data, each row is a galaxy, each column is a band as per
+            band defined above
+        training_z: numpy array, size Ngalaxies
+            true redshift for the training sample
+
+        """
+
+        print("Initialising")
+        #Number of tomographic bins
+        n_bin = self.opt['bins']
+        #Dimensions of the SOM
+        som_dim = self.opt['som_dim']
+        #Number of discrete SOM groups
+        num_groups = self.opt['num_groups']
+        #Number of threads
+        num_threads = self.opt['num_threads']
+        #Sparse Frac
+        sparse_frac = self.opt['sparse_frac']
+        #Data Threshold
+        data_threshold = self.opt['data_threshold']
+        #Metric
+        metric = self.opt['metric']
+        #Plots
+        plots = self.opt['plots']
+        #Plot Output Directory
+        plot_dir = self.opt['plot_dir']
+        #Group Type
+        group_type = self.opt['group_type']
+        #Check that the group type is a valid choice
+        if group_type != 'colour' and group_type != 'redshift':
+            group_type = 'redshift'
+
+        #Define the redshift summary statistics (used for making groups in the 'redshift' case)
+        property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","N","iqr_z_true")
+        property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)",
+                                "mad(data$redshift_true)","nrow(data)",
+                                "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))")
+        #Define the SOM variables
+        if self.bands == 'riz':
+            #riz bands
+            #expressions = ("r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("r-i","r-z","i-z",
+                           "z","r-i-(i-z)")
+        elif self.bands == 'griz':
+            #griz bands
+            #expressions = ("g_mag-r_mag","g_mag-i_mag",
+            #               "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","g_mag-r_mag-(r_mag-i_mag)",
+            #               "r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("g-r","g-i",
+                           "g-z","r-i","r-z","i-z",
+                           "z","g-r-(r-i)",
+                           "r-i-(i-z)")
+        elif self.bands == 'grizy':
+            #grizy bands
+            #expressions = ("g_mag-r_mag","g_mag-i_mag",
+            #               "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag",
+            #               "z_mag-y_mag","z_mag","g_mag-r_mag-(r_mag-i_mag)",
+            #               "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)")
+            expressions = ("g-r","g-i",
+                           "g-z","g-y","r-i","r-z","r-y","i-z","i-y",
+                           "z-y","z","g-r-(r-i)",
+                           "r-i-(i-z)","i-z-(z-y)")
+        elif self.bands == 'ugriz':
+            #ugriz bands
+            #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","g_mag-r_mag","g_mag-i_mag",
+            #               "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)",
+            #               "r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("u-g","u-r","u-i","u-z","g-r","g-i",
+                           "g-z","r-i","r-z","i-z",
+                           "z","u-g-(g-r)","g-r-(r-i)",
+                           "r-i-(i-z)")
+        elif self.bands == 'ugrizy':
+            #ugrizy bands
+            #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","u_mag-y_mag","g_mag-r_mag","g_mag-i_mag",
+            #
"g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", + # "z_mag-y_mag","z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + + print("Preparing the data") + training_data = pd.DataFrame.from_dict(training_data) + #Add the redshift variable to the train data + print("Adding redshift info to training data") + training_data['redshift_true'] = training_z + + if sparse_frac < 1: + print("Sparse Sampling the training data") + cut = np.random.uniform(0, 1, training_z.size) < sparse_frac + training_data = training_data[cut] + training_z = training_z[cut] + + #Construct the training data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + train_df = ro.conversion.py2rpy(training_data) + + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + + #If grouping by redshift, construct the cell redshift statistics + if group_type == 'redshift' or plots == True: + print("Constructing cell-based redshift properties") + #Construct the Nz properties per SOM cell + cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + print("Constructing redshift-based hierarchical cluster tree") + #Cluster the SOM cells into num_groups groups + hclust=stats.hclust(stats.dist(cell_prop.rx2['property'])) + cell_group=stats.cutree(hclust,k=num_groups) + + #Assign the cell groups to the SOM structure + som.rx2['hclust']=hclust + som.rx2['cell_clust']=cell_group + + #Construct the Nz properties per SOM group + print("Constructing group-based redshift properties") + group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels), + n_cluster_bins=num_groups) + + #extract the training som (just for convenience) + train_som = group_prop.rx2('som') + + if plots == True: + #Make the diagnostic plots + props = group_prop.rx2('property') + print(base.colnames(props)) + print("Constructing SOM diagnostic figures") + #Initialise the plot device + dev.png(file=f'{plot_dir}/SOMfig_%02d.png',height=5,width=5,res=220,unit='in') + #Paint the SOM by the training properties + for i in range(len(expressions)): + gr.plot(train_som,property=i+1,shape='straight',heatkeywidth=som_dim[0]/20,ncolors=1e3,zlim=FloatVector([-3,3])) + #Plot the standard diagnostics: + #Count + gr.plot(train_som,type='count',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,ncolors=1e3) + #Quality + gr.plot(train_som,type='quality',shape='straight',heatkeywidth=som_dim[0]/20,zlim=FloatVector([0,0.5]),ncolors=1e3) + #UMatrix + gr.plot(train_som,type='dist',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,zlim=FloatVector([0,0.5]),ncolors=1e3) + #Sometimes these figures need customised limits; the "try" stops bad plots from stopping the code + + #Paint by the 
redshift diagnostics: + #mean redshift + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group mean redshift',zlim=FloatVector([0,1.8])) + #redshift std + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group redshift stdev',zlim=FloatVector([0,0.2])) + #2sigma redshift IQR + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='group 2sigma IQR',zlim=FloatVector([0,0.4])) + #N group + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') + + #Close the plot device + dev.dev_off() + + print("Constructing the per-group Nz") + #Get the group assignments for the training data + train_group = np.array(train_som.rx2['clust.classif']) + tab=base.table(FactorVector(train_som.rx2['clust.classif'],levels=base.seq(num_groups))) + #Construct the Nz z-axis + z_arr = np.linspace(0, 2, 1024) + z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2. + #Construct the per-group Nz + nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] + # Now initialize a S/N calculator for these initial groups. + if metric == 'SNR_ww': + c_wl = SnCalc(z_cen, nzs, use_clustering=False) + else: + c_wl = SnCalc(z_cen, nzs, use_clustering=True) + print("Initializing WL") + c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin) + # Finally, let's write the function to minimize and optimize for a 4-bin case. + def minus_sn(edges, calc): + return -calc.get_sn_from_edges(edges) + + print("Optimizing WL") + edges_0 = np.linspace(0, 2, n_bin-1) + res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) + print("WL final edges: ", res.x) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)) + print(" ") + + + + #Construct the per-group Nz + print("Outputting trained SOM and redshift ordering of SOM groups") + + print("Finished") + self.train_som = train_som + self.group_z = group_z + self.z_order = z_order + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. 
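+                For example, a returned array [0, 2, 1, ...] places the
+                first galaxy in tomographic bin 0, the second in bin 2,
+                and so on.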
+ """ + + #Number of tomographic bins + n_bin = self.opt['bins'] + + print("Preparing the data") + data = pd.DataFrame.from_dict(data) + + #Construct the validation data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + data_df = ro.conversion.py2rpy(data) + + print("Parsing the validation data into the SOM groupings") + #Generate the validation associations/groups + group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, + expression="nrow(data)",expr_label="N", + n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) + + #Calculate the cumulative count + print("Generate cumulative source counts a.f.o. group mean z") + zcumsum=base.cumsum(group_prop.rx2('property').rx(self.z_order)) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + print("Assign the groups to tomographic bins") + p = np.linspace(0, 100, n_bin + 1) + n_edges = np.percentile(zcumsum, p) + + # Now put the groups into redshift bins. + group_bins = FloatVector(base.cut(FloatVector(zcumsum),FloatVector(n_edges), + include=True)).rx(base.order(self.z_order)) + + #extract the validation som (just for convenience) + valid_som = group_prop.rx2('som') + + #Assign the sources, by group, to tomographic bins + print("Output source tomographic bin assignments") + valid_bin = base.unsplit(group_bins,FactorVector(valid_som.rx2['clust.classif'], + levels=base.seq(self.opt['num_groups'])),drop=False) + #valid_bin = valid_som.rx2['clust.classif'] + valid_bin = np.array(valid_bin)-1 + + return valid_bin + diff --git a/tomo_challenge/classifiers/SimpleSOM.py b/tomo_challenge/classifiers/SimpleSOM.py new file mode 100644 index 00000000..f3b0b74c --- /dev/null +++ b/tomo_challenge/classifiers/SimpleSOM.py @@ -0,0 +1,323 @@ +""" +This is an example tomographic bin generator using a random forest. + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import rpy2 +import rpy2.robjects as ro +import rpy2.robjects.packages as rpack +from rpy2.robjects.vectors import StrVector, IntVector, DataFrame, FloatVector, FactorVector +import pandas as pd +from rpy2.robjects import pandas2ri +from rpy2.robjects.conversion import localconverter + +#Check that all the needed packages are installed +# R package nameo +packnames = ('data.table','itertools','foreach','doParallel','RColorBrewer','devtools','matrixStats') +base=ro.packages.importr("base") +utils=ro.packages.importr("utils") +stats=ro.packages.importr("stats") +gr=ro.packages.importr("graphics") +dev=ro.packages.importr("grDevices") +utils.chooseCRANmirror(ind=1) +# Selectively install what needs to be installed. 
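+# (i.e. so that importing this module repeatedly does not re-install
+# packages that are already present)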
+names_to_install = [x for x in packnames if not rpack.isinstalled(x)]
+if len(names_to_install) > 0:
+    utils.install_packages(StrVector(names_to_install))
+base.Sys_setenv(TAR=base.system("which tar",intern=True))
+devtools=ro.packages.importr("devtools")
+devtools.install_github("AngusWright/helpRfuncs")
+devtools.install_github("AngusWright/kohonen/kohonen")
+kohonen=ro.packages.importr("kohonen")
+
+class SimpleSOM(Tomographer):
+    """ Simplistic SOM Classifier """
+
+    # valid parameter -- see below
+    valid_options = ['bins','som_dim','num_groups','num_threads',
+                     'group_type','data_threshold','sparse_frac','plots','plot_dir']
+    # this setting, if True, means arrays will be sent to train and apply
+    # instead of dictionaries
+    wants_arrays = False
+
+    def __init__ (self, bands, options):
+        """Constructor
+
+        Parameters:
+        -----------
+        bands: str
+            string containing valid bands, like 'riz' or 'griz'
+        options: dict
+            options come through here. Valid keys are listed as valid_options
+            class variable.
+
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+            'som_dim' - dimensions of the SOM
+            'num_groups' - number of SOM groups/associations
+            'num_threads' - number of threads to use
+            'sparse_frac' - the sparse-sampling fraction used for training
+            'group_type' - do we want grouping by colour or redshift
+            'data_threshold' - lower and upper limits applied to the data
+                values during training
+            'plots' - whether to produce diagnostic plots
+            'plot_dir' - output directory for the diagnostic plots
+
+        """
+        self.bands = bands
+        self.opt = options
+
+    def train (self, training_data, training_z):
+        """Trains the SOM and outputs the resulting bins
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxies x Nbands
+            training data, each row is a galaxy, each column is a band as per
+            band defined above
+        training_z: numpy array, size Ngalaxies
+            true redshift for the training sample
+
+        """
+
+        print("Initialising")
+        #Number of tomographic bins
+        n_bin = self.opt['bins']
+        #Dimensions of the SOM
+        som_dim = self.opt['som_dim']
+        #Number of discrete SOM groups
+        num_groups = self.opt['num_groups']
+        #Number of threads
+        num_threads = self.opt['num_threads']
+        #Sparse Frac
+        sparse_frac = self.opt['sparse_frac']
+        #Data Threshold
+        data_threshold = self.opt['data_threshold']
+        #Plots
+        plots = self.opt['plots']
+        #Plot Output Directory
+        plot_dir = self.opt['plot_dir']
+        #Group Type
+        group_type = self.opt['group_type']
+        #Check that the group type is a valid choice
+        if group_type != 'colour' and group_type != 'redshift':
+            group_type = 'redshift'
+
+        #Define the redshift summary statistics (used for making groups in the 'redshift' case)
+        property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","N","iqr_z_true")
+        property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)",
+                                "mad(data$redshift_true)","nrow(data)",
+                                "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))")
+        #Define the SOM variables
+        if self.bands == 'riz':
+            #riz bands
+            #expressions = ("r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("r-i","r-z","i-z",
+                           "z","r-i-(i-z)")
+        elif self.bands == 'griz':
+            #griz bands
+            #expressions = ("g_mag-r_mag","g_mag-i_mag",
+            #               "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","g_mag-r_mag-(r_mag-i_mag)",
+            #               "r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("g-r","g-i",
+                           "g-z","r-i","r-z","i-z",
+                           "z","g-r-(r-i)",
+                           "r-i-(i-z)")
+        elif self.bands == 'grizy':
+            #grizy bands
+            #expressions = ("g_mag-r_mag","g_mag-i_mag",
+            #
"g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", + # "z_mag-y_mag","z_mag","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") + expressions = ("g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + elif self.bands == 'ugriz': + #ugrizy bands + #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","g_mag-r_mag","g_mag-i_mag", + # "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", + # "z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)") + expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)") + elif self.bands == 'ugrizy': + #ugrizy bands + #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","u_mag-y_mag","g_mag-r_mag","g_mag-i_mag", + # "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", + # "z_mag-y_mag","z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + + print("Preparing the data") + training_data = pd.DataFrame.from_dict(training_data) + #Add the redshift variable to the train data + print("Adding redshift info to training data") + training_data['redshift_true'] = training_z + + if sparse_frac < 1: + print("Sparse Sampling the training data") + cut = np.random.uniform(0, 1, training_z.size) < sparse_frac + training_data = training_data[cut] + + #Construct the training data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + train_df = ro.conversion.py2rpy(training_data) + + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + + #If grouping by redshift, construct the cell redshift statistics + if group_type == 'redshift' or plots == True: + print("Constructing cell-based redshift properties") + #Construct the Nz properties per SOM cell + cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + print("Constructing redshift-based hierarchical cluster tree") + #Cluster the SOM cells into num_groups groups + hclust=stats.hclust(stats.dist(cell_prop.rx2['property'])) + cell_group=stats.cutree(hclust,k=num_groups) + + #Assign the cell groups to the SOM structure + som.rx2['hclust']=hclust + som.rx2['cell_clust']=cell_group + + #Construct the Nz properties per SOM group + print("Constructing group-based redshift properties") + group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels), + n_cluster_bins=num_groups) + + #extract the training som (just for convenience) + train_som = group_prop.rx2('som') + + if plots == True: + #Make the diagnostic plots + props = 
group_prop.rx2('property') + print(base.colnames(props)) + print("Constructing SOM diagnostic figures") + #Initialise the plot device + dev.png(file=f'{plot_dir}/SOMfig_%02d.png',height=5,width=5,res=220,unit='in') + #Paint the SOM by the training properties + for i in range(len(expressions)): + gr.plot(train_som,property=i+1,shape='straight',heatkeywidth=som_dim[0]/20,ncolors=1e3,zlim=FloatVector([-3,3])) + #Plot the standard diagnostics: + #Count + gr.plot(train_som,type='count',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,ncolors=1e3) + #Quality + gr.plot(train_som,type='quality',shape='straight',heatkeywidth=som_dim[0]/20,zlim=FloatVector([0,0.5]),ncolors=1e3) + #UMatrix + gr.plot(train_som,type='dist',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,zlim=FloatVector([0,0.5]),ncolors=1e3) + #Sometimes these figures need customised limits; the "try" stops bad plots from stopping the code + + #Paint by the redshift diagnostics: + #mean redshift + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group mean redshift',zlim=FloatVector([0,1.8])) + #redshift std + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group redshift stdev',zlim=FloatVector([0,0.2])) + #2sigma redshift IQR + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='group 2sigma IQR',zlim=FloatVector([0,0.4])) + #N group + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') + + #Close the plot device + dev.dev_off() + + + + print("Outputting trained SOM and redshift ordering of SOM groups") + #Extract the mean-z per group + group_z = group_prop.rx2('property').rx(True, + base.which(group_prop.rx2('property').colnames.ro=='mean_z_true')) + #Order the groups by mean z + z_order = base.order(group_z) + + print("Finished") + self.train_som = train_som + self.group_z = group_z + self.z_order = z_order + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + + #Number of tomographic bins + n_bin = self.opt['bins'] + + print("Preparing the data") + data = pd.DataFrame.from_dict(data) + + #Construct the validation data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + data_df = ro.conversion.py2rpy(data) + + print("Parsing the validation data into the SOM groupings") + #Generate the validation associations/groups + group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, + expression="nrow(data)",expr_label="N", + n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) + + #Calculate the cumulative count + print("Generate cumulative source counts a.f.o. 
group mean z") + zcumsum=base.cumsum(group_prop.rx2('property').rx(self.z_order)) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + print("Assign the groups to tomographic bins") + p = np.linspace(0, 100, n_bin + 1) + n_edges = np.percentile(zcumsum, p) + + # Now put the groups into redshift bins. + group_bins = FloatVector(base.cut(FloatVector(zcumsum),FloatVector(n_edges), + include=True)).rx(base.order(self.z_order)) + + #extract the validation som (just for convenience) + valid_som = group_prop.rx2('som') + + #Assign the sources, by group, to tomographic bins + print("Output source tomographic bin assignments") + valid_bin = base.unsplit(group_bins,FactorVector(valid_som.rx2['clust.classif'], + levels=base.seq(self.opt['num_groups'])),drop=False) + #valid_bin = valid_som.rx2['clust.classif'] + valid_bin = np.array(valid_bin)-1 + + return valid_bin + From 57f15155e6acbc3710e50b6724e88c16ae9d40b4 Mon Sep 17 00:00:00 2001 From: Bela Abolfathi Date: Fri, 28 Aug 2020 12:21:45 -0700 Subject: [PATCH 04/35] Added simple and complex .yaml --- example/ComplexSOM.yaml | 29 +++++++++++++++++++++++++++++ example/SimpleSOM.yaml | 28 ++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 example/ComplexSOM.yaml create mode 100644 example/SimpleSOM.yaml diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml new file mode 100644 index 00000000..63a97e3e --- /dev/null +++ b/example/ComplexSOM.yaml @@ -0,0 +1,29 @@ +metrics: [SNR_3x2] +bands: riz +training_file: data/training.hdf5 +validation_file: data/validation_small.hdf5 +output_file: example/example_output.txt +# Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" +metrics_impl: firecrown + +run: + # This is a class name which will be looked up + ComplexSOM: + run_3: + # These settings are sent to the classifier + bins: 5 + som_dim: [21,21] + num_groups: 100 + num_threads: 32 + group_type: 'redshift' + data_threshold: [0,30] + sparse_frac: 0.05 + plots: True + plot_dir: 'plots' + metric: "SNR_3x2" + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: False + errors: False + diff --git a/example/SimpleSOM.yaml b/example/SimpleSOM.yaml new file mode 100644 index 00000000..6f261f85 --- /dev/null +++ b/example/SimpleSOM.yaml @@ -0,0 +1,28 @@ +metrics: [SNR_3x2] +bands: riz +training_file: data/training.hdf5 +validation_file: data/validation_small.hdf5 +output_file: example/example_output.txt +# Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" +metrics_impl: firecrown + +run: + # This is a class name which will be looked up + SimpleSOM: + run_3: + # These settings are sent to the classifier + bins: 10 + som_dim: [151,151] + num_groups: 200 + num_threads: 32 + group_type: 'redshift' + data_threshold: [0,30] + sparse_frac: 0.05 + plots: True + plot_dir: 'plots' + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: False + errors: False + From a6465c1b0f3b6ca9bc062b980a257da268d7da60 Mon Sep 17 00:00:00 2001 From: Bela Abolfathi Date: Fri, 28 Aug 2020 12:23:16 -0700 Subject: [PATCH 05/35] Add updated snc.py --- the_atlantics/snc.py | 90 ++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 49 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index d62379d9..27aa0c09 100644 --- a/the_atlantics/snc.py +++ 
b/the_atlantics/snc.py @@ -5,20 +5,19 @@ assign_params_default = {'p_inbin_thr': 0.5, - 'p_outbin_thr': 0.2, - 'use_p_inbin': False, - 'use_p_outbin': False} + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} class SnCalc(object): edges_large = 100. def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.28, use_clustering=False): + s_gamma=0.28, use_clustering=False): """ S/N calculator - Args: - z_arr (array_like): array of redshifts at which all the N(z)s are + z_arr (array_like): array of redshifts at which all the N(z)s are sampled. They should be linearly spaced. nz_list (list): list of arrays containing the redshift distributions of all the initial groups (each array in the @@ -44,10 +43,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.z_arr = z_arr self.nz_list = nz_list self.n_gals = np.array([simps(nz, x=self.z_arr) - for nz in self.nz_list]) + for nz in self.nz_list]) self.pz_list = self.nz_list / self.n_gals[:, None] self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) - for nz in self.nz_list]) / self.n_gals + for nz in self.nz_list]) / self.n_gals self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering @@ -57,9 +56,8 @@ def _bz_model(self, cosmo, z): def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. - Args: - fname_save (string): if not None, the result will be saved to a + fname_save (string): if not None, the result will be saved to a file by this name, and if present and `recompute=False`, the matrix will be read from that file. recompute (bool): if `True`, any previous calculations of the @@ -72,20 +70,23 @@ def get_cl_matrix(self, fname_save=None, recompute=False): return # Cosmology - cosmo = ccl.CosmologyVanillaLCDM() + cosmo = ccl.Cosmology(Omega_c=0.25, + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) # Tracers if self.use_clustering: trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] else: trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + for nz in self.nz_list] - # Cls + # Cls self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): @@ -100,9 +101,8 @@ def get_cl_matrix(self, fname_save=None, recompute=False): def get_resample_metadata(self, assign): """ Transform an assignment list into a weight matrix and the corresponding number densities. - Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. @@ -136,9 +136,8 @@ def _get_nl_resamples(self, n_dens): def get_sn_wn(self, weights, n_dens, full_output=False): """ Compute signal-to-noise ratio from weights matrix and number densities of the resampled groups. - Args: - weights (array_like): weights matrix of shape + weights (array_like): weights matrix of shape `(N_resample, N_initial)`, where `N_resample` is the number of new groups, and `N_initial` is the original number of groups. Each entry corresponds to the weight @@ -148,9 +147,8 @@ def get_sn_wn(self, weights, n_dens, full_output=False): new groups. 
full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. - Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ sl = self._get_cl_resamples(weights) @@ -159,10 +157,10 @@ def get_sn_wn(self, weights, n_dens, full_output=False): icl = np.linalg.inv(cl) sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], - axis=2) + axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + self.d_ell / self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -172,40 +170,36 @@ def get_sn_wn(self, weights, n_dens, full_output=False): def get_sn(self, assign, full_output=False): """ Compute signal-to-noise ratio from a bin assignment list. - Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. - Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ w, ndens = self.get_resample_metadata(assign) return self.get_sn_wn(w, ndens, - full_output=full_output) + full_output=full_output) def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. """ return np.any(edges < 0) or \ - np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) def assign_from_edges(self, edges, assign_params=assign_params_default): """ Get assignment list from edges and assign parameters. - Args: - edges (array_like): array of bin edges. + edges (array_like): array of bin edges. assign_params (dict): dictionary of assignment parameters (see `assign_params_default`). - Returns: - List of assignment arrays ready to be used in e.g. `get_sn`. + List of assignment arrays ready to be used in e.g. `get_sn`. 
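+            For example, with group mean redshifts [0.3, 0.5, 0.9] and
+            edges=[0.6], the result is [array([0, 1]), array([2])].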
""" nbins = len(edges) + 1 # Bin IDs based on mean z @@ -217,40 +211,38 @@ def assign_from_edges(self, edges, assign_params=assign_params_default): if assign_params['use_p_outbin']: # Matrix of probabilities in each bin pzs = np.array([[simps(pz[m], x=self.z_arr[m]) - for m in masks] - for pz in self.pz_list]) + for m in masks] + for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] - for j in range(self.n_samples)]) + for j in range(self.n_samples)]) else: # Overlaps in own bin pzd = np.array([simps(pz[masks[i]], - x=self.z_arr[masks[i]]) - for pz, i in zip(self.pz_list, ids)]) + x=self.z_arr[masks[i]]) + for pz, i in zip(self.pz_list, ids)]) - # Throw away based on in-bin probability + # Throw away based on in-bin probability if assign_params['use_p_inbin']: ids[pzd < assign_params['p_inbin_thr']] = -1 # Throw away based on off-bin probability if assign_params['use_p_outbin']: ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 - for p in pzs])] = -1 - # Assignment list + for p in pzs])] = -1 + # Assignment list return [np.where(ids == i)[0] for i in np.unique(ids)] def get_sn_from_edges(self, edges, full_output=False, - assign_params=assign_params_default): + assign_params=assign_params_default): """ Compute signal-to-noise ratio from a set of edges and assignment parameters. - Args: - edges (array_like): array of bin edges. + edges (array_like): array of bin edges. assign_params (dict): dictionary of assignment parameters (see `assign_params_default`). full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. - Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ if self.check_edges(edges): From 0223629455a150435c03ea7c0fd00be609f584ff Mon Sep 17 00:00:00 2001 From: Bela Abolfathi Date: Fri, 28 Aug 2020 15:51:03 -0700 Subject: [PATCH 06/35] Add updated complex classifier --- tomo_challenge/classifiers/ComplexSOM.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 501617e8..c122a928 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -12,6 +12,7 @@ See Classifier Documentation below. 
""" +import os from .base import Tomographer import numpy as np import rpy2 @@ -40,12 +41,12 @@ utils.install_packages(StrVector(names_to_install)) base.Sys_setenv(TAR=base.system("which tar",intern=True)) devtools=ro.packages.importr("devtools") -devtools.install_github("AngusWright/helpRfuncs") -devtools.install_github("AngusWright/kohonen/kohonen") +#devtools.install_github("AngusWright/helpRfuncs") +#devtools.install_github("AngusWright/kohonen/kohonen") kohonen=ro.packages.importr("kohonen") class ComplexSOM(Tomographer): - """ Simplistic SOM Classifier """ + """ Complex SOM Classifier with SNR Optimisation """ # valid parameter -- see below valid_options = ['bins','som_dim','num_groups','num_threads', @@ -258,16 +259,23 @@ def train (self, training_data, training_z): #Get the group assignments for the training data train_group = np.array(train_som.rx2['clust.classif']) tab=base.table(FactorVector(train_som.rx2['clust.classif'],levels=base.seq(num_groups))) + print(tab) + print(np.arange(num_groups)[np.array(tab)==0]) #Construct the Nz z-axis - z_arr = np.linspace(0, 2, 1024) + z_arr = np.linspace(0, 2, 124) z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2. #Construct the per-group Nz nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] + #Update the fsky + n_gal = np.sum(np.array(tab)) # Total number of galaxies you have, change appropriately + n_eff = 20. # Target density in arcmin^-2 + fsky = n_gal / (4*np.pi*(180 * 60 / np.pi)**2 * n_eff) # Now initialize a S/N calculator for these initial groups. + os.system('rm wl_nb%d.npz' % n_bin) if metric == 'SNR_ww': - c_wl = SnCalc(z_cen, nzs, use_clustering=False) + c_wl = SnCalc(z_cen, nzs, use_clustering=False,fsky=fsky) else: - c_wl = SnCalc(z_cen, nzs, use_clustering=True) + c_wl = SnCalc(z_cen, nzs, use_clustering=True,fsky=fsky) print("Initializing WL") c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin) # Finally, let's write the function to minimize and optimize for a 4-bin case. @@ -278,8 +286,9 @@ def minus_sn(edges, calc): edges_0 = np.linspace(0, 2, n_bin-1) res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) print("WL final edges: ", res.x) - print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)*np.sqrt(0.25/fsky)) print(" ") + print(res) From a6b069d3dd97f2651911d9920dc0211b2fa0cd54 Mon Sep 17 00:00:00 2001 From: Andrina Nicola Date: Sun, 30 Aug 2020 11:06:21 -0700 Subject: [PATCH 07/35] Improved interface between snc and apply. --- the_atlantics/snc.py | 7 +- tomo_challenge/classifiers/ComplexSOM.py | 30 ++- tomo_challenge/classifiers/SimpleSOM.py | 4 +- tomo_challenge/snc.py | 254 +++++++++++++++++++++++ 4 files changed, 273 insertions(+), 22 deletions(-) create mode 100644 tomo_challenge/snc.py diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 27aa0c09..0574acee 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -192,7 +192,7 @@ def check_edges(self, edges): np.any(edges > self.edges_large) or \ np.any(np.diff(edges) < 0) - def assign_from_edges(self, edges, assign_params=assign_params_default): + def assign_from_edges(self, edges, assign_params=assign_params_default, return_bins=False): """ Get assignment list from edges and assign parameters. Args: edges (array_like): array of bin edges. 
@@ -229,7 +229,10 @@ def assign_from_edges(self, edges, assign_params=assign_params_default): ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 for p in pzs])] = -1 # Assignment list - return [np.where(ids == i)[0] for i in np.unique(ids)] + if return_bins: + return [np.where(ids == i)[0] for i in np.unique(ids)], np.unique(ids) + else: + return [np.where(ids == i)[0] for i in np.unique(ids)] def get_sn_from_edges(self, edges, full_output=False, assign_params=assign_params_default): diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index c122a928..37211d73 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -266,6 +266,9 @@ def train (self, training_data, training_z): z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2. #Construct the per-group Nz nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] + + # np.save('nzs.npy', nzs) + #Update the fsky n_gal = np.sum(np.array(tab)) # Total number of galaxies you have, change appropriately n_eff = 20. # Target density in arcmin^-2 @@ -290,15 +293,20 @@ def minus_sn(edges, calc): print(" ") print(res) - + groups_in_tomo, tomo_bins = c_wl.assign_from_edges(res.x, return_bins=True) + group_bins = np.zeros(num_groups) + for i, bin_no in enumerate(tomo_bins): + print(i) + print(bin_no) + group_bins[groups_in_tomo[i]] = bin_no + print(group_bins) #Construct the per-group Nz print("Outputting trained SOM and redshift ordering of SOM groups") print("Finished") self.train_som = train_som - self.group_z = group_z - self.z_order = z_order + self.group_bins = group_bins def apply (self, data): """Applies training to the data. @@ -333,26 +341,12 @@ def apply (self, data): expression="nrow(data)",expr_label="N", n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) - #Calculate the cumulative count - print("Generate cumulative source counts a.f.o. group mean z") - zcumsum=base.cumsum(group_prop.rx2('property').rx(self.z_order)) - - # Find the edges that split the redshifts into n_z bins of - # equal number counts in each - print("Assign the groups to tomographic bins") - p = np.linspace(0, 100, n_bin + 1) - n_edges = np.percentile(zcumsum, p) - - # Now put the groups into redshift bins. 
- group_bins = FloatVector(base.cut(FloatVector(zcumsum),FloatVector(n_edges), - include=True)).rx(base.order(self.z_order)) - #extract the validation som (just for convenience) valid_som = group_prop.rx2('som') #Assign the sources, by group, to tomographic bins print("Output source tomographic bin assignments") - valid_bin = base.unsplit(group_bins,FactorVector(valid_som.rx2['clust.classif'], + valid_bin = base.unsplit(self.group_bins,FactorVector(valid_som.rx2['clust.classif'], levels=base.seq(self.opt['num_groups'])),drop=False) #valid_bin = valid_som.rx2['clust.classif'] valid_bin = np.array(valid_bin)-1 diff --git a/tomo_challenge/classifiers/SimpleSOM.py b/tomo_challenge/classifiers/SimpleSOM.py index f3b0b74c..43d7ed98 100644 --- a/tomo_challenge/classifiers/SimpleSOM.py +++ b/tomo_challenge/classifiers/SimpleSOM.py @@ -37,8 +37,8 @@ utils.install_packages(StrVector(names_to_install)) base.Sys_setenv(TAR=base.system("which tar",intern=True)) devtools=ro.packages.importr("devtools") -devtools.install_github("AngusWright/helpRfuncs") -devtools.install_github("AngusWright/kohonen/kohonen") +#devtools.install_github("AngusWright/helpRfuncs") +#devtools.install_github("AngusWright/kohonen/kohonen") kohonen=ro.packages.importr("kohonen") class SimpleSOM(Tomographer): diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py new file mode 100644 index 00000000..0574acee --- /dev/null +++ b/tomo_challenge/snc.py @@ -0,0 +1,254 @@ +import numpy as np +import pyccl as ccl +from scipy.integrate import simps +import os + + +assign_params_default = {'p_inbin_thr': 0.5, + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} + + +class SnCalc(object): + edges_large = 100. + + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, + s_gamma=0.28, use_clustering=False): + """ S/N calculator + Args: + z_arr (array_like): array of redshifts at which all the N(z)s are + sampled. They should be linearly spaced. + nz_list (list): list of arrays containing the redshift + distributions of all the initial groups (each array in the + list corresponds to one group). The arrays should contain the + redshift distribution sampled at `z_arr`, with its integral + over `z_arr` equal to the number of galaxies in that group. + fsky (float): sky fraction (for SNR calculation and for + transforming number of galaxies into number densities). + lmax (float): maximum ell. + d_ell (float): ell bandpower width. + s_gamma (float): rms ellipticity scatter (relevant if + `use_clustering=False`). + use_clustering (bool): if `True`, SNR will be computed for + clustering instead of lensing. + """ + self.s_gamma = s_gamma + self.fsky = fsky + self.lmax = lmax + self.d_ell = d_ell + self.larr = np.arange(2, lmax, d_ell)+d_ell/2 + self.n_ell = len(self.larr) + self.n_samples = len(nz_list) + self.z_arr = z_arr + self.nz_list = nz_list + self.n_gals = np.array([simps(nz, x=self.z_arr) + for nz in self.nz_list]) + self.pz_list = self.nz_list / self.n_gals[:, None] + self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) + for nz in self.nz_list]) / self.n_gals + self.n_dens = self.n_gals / (4*np.pi*self.fsky) + self.cls = None + self.use_clustering = use_clustering + + def _bz_model(self, cosmo, z): + return 0.95/ccl.growth_factor(cosmo, 1./(1+z)) + + def get_cl_matrix(self, fname_save=None, recompute=False): + """ Computes matrix of power spectra between all the initial groups. 
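+            The resulting array of shape (n_ell, n_samples, n_samples) is
+            cached in `self.cls` and can also be stored on disk (see
+            `fname_save` below).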
+ Args: + fname_save (string): if not None, the result will be saved to a + file by this name, and if present and `recompute=False`, the + matrix will be read from that file. + recompute (bool): if `True`, any previous calculations of the + C_ell matrix stored on file will be ignored, and the matrix + will be recomputed. + """ + if fname_save is not None: + if os.path.isfile(fname_save) and not recompute: + self.cls = np.load(fname_save)['cls'] + return + + # Cosmology + cosmo = ccl.Cosmology(Omega_c=0.25, + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) + + # Tracers + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + + # Cls + self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) + for i in range(self.n_samples): + for j in range(i, self.n_samples): + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr) + self.cls[:, i, j] = cl + if j != i: + self.cls[:, j, i] = cl + + if fname_save is not None: + np.savez(fname_save, cls=self.cls) + + def get_resample_metadata(self, assign): + """ Transform an assignment list into a weight matrix and the + corresponding number densities. + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + """ + n_resamples = len(assign) + weight_res = np.zeros([n_resamples, self.n_samples]) + n_dens_res = np.zeros(n_resamples) + for ia, a in enumerate(assign): + ndens = self.n_dens[a] + ndens_tot = np.sum(ndens) + n_dens_res[ia] = ndens_tot + weight_res[ia, a] = ndens / ndens_tot + return weight_res, n_dens_res + + def _get_cl_resamples(self, weights): + """ Gets C_ell matrix for resampled groups from weights matrix. + """ + if self.cls is None: + self.get_cl_matrix() + return np.einsum('jl,km,ilm', weights, weights, self.cls) + + def _get_nl_resamples(self, n_dens): + """ Gets noise contribution to C_ell matrix for resampled groups + from list of number densities. + """ + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_wn(self, weights, n_dens, full_output=False): + """ Compute signal-to-noise ratio from weights matrix and number + densities of the resampled groups. + Args: + weights (array_like): weights matrix of shape + `(N_resample, N_initial)`, where `N_resample` is the + number of new groups, and `N_initial` is the original + number of groups. Each entry corresponds to the weight + which with a given initial group enters the new set of + groups. + n_dens (array_like): number density (in sterad^-1) of the + new groups. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. 
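+            Note: as implemented below, the total S/N is
+            SNR^2 = sum_ell (2*ell+1) * d_ell * Tr[(S_ell C_ell^-1)^2] / fsky,
+            where S_ell is the signal covariance, N_ell the noise, and
+            C_ell = S_ell + N_ell.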
+ """ + sl = self._get_cl_resamples(weights) + nl = self._get_nl_resamples(n_dens) + cl = sl + nl + icl = np.linalg.inv(cl) + sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) + sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], + axis=2) + trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) + snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * + self.d_ell / self.fsky)) + + if full_output: + dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} + return dret + else: + return snr + + def get_sn(self, assign, full_output=False): + """ Compute signal-to-noise ratio from a bin assignment list. + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + w, ndens = self.get_resample_metadata(assign) + return self.get_sn_wn(w, ndens, + full_output=full_output) + + def check_edges(self, edges): + """ Returns `True` if there's something wrong with the edges. + """ + return np.any(edges < 0) or \ + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) + + def assign_from_edges(self, edges, assign_params=assign_params_default, return_bins=False): + """ Get assignment list from edges and assign parameters. + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + Returns: + List of assignment arrays ready to be used in e.g. `get_sn`. + """ + nbins = len(edges) + 1 + # Bin IDs based on mean z + ids = np.digitize(self.z_means, bins=edges) + if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + # Calculate probabilities in each bin + zgroups = np.digitize(self.z_arr, bins=edges) + masks = [zgroups == i for i in range(nbins)] + if assign_params['use_p_outbin']: + # Matrix of probabilities in each bin + pzs = np.array([[simps(pz[m], x=self.z_arr[m]) + for m in masks] + for pz in self.pz_list]) + pzd = np.array([pzs[j, ids[j]] + for j in range(self.n_samples)]) + else: + # Overlaps in own bin + pzd = np.array([simps(pz[masks[i]], + x=self.z_arr[masks[i]]) + for pz, i in zip(self.pz_list, ids)]) + + # Throw away based on in-bin probability + if assign_params['use_p_inbin']: + ids[pzd < assign_params['p_inbin_thr']] = -1 + # Throw away based on off-bin probability + if assign_params['use_p_outbin']: + ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 + for p in pzs])] = -1 + # Assignment list + if return_bins: + return [np.where(ids == i)[0] for i in np.unique(ids)], np.unique(ids) + else: + return [np.where(ids == i)[0] for i in np.unique(ids)] + + def get_sn_from_edges(self, edges, full_output=False, + assign_params=assign_params_default): + """ Compute signal-to-noise ratio from a set of edges and assignment + parameters. + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. 
+ """ + if self.check_edges(edges): + return 0 + assign = self.assign_from_edges(edges, assign_params=assign_params) + return self.get_sn(assign, full_output=full_output) From 7ed22282b060e2688d14d90dabb737e380e27bf9 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Sun, 30 Aug 2020 19:17:53 +0100 Subject: [PATCH 08/35] optimizer with all options --- the_atlantics/optimizer.py | 111 +++++++++++++++++++++++ the_atlantics/snc.py | 49 ++++++---- tomo_challenge/classifiers/ComplexSOM.py | 7 +- tomo_challenge/snc.py | 49 ++++++---- 4 files changed, 176 insertions(+), 40 deletions(-) create mode 100644 the_atlantics/optimizer.py diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py new file mode 100644 index 00000000..9369a28e --- /dev/null +++ b/the_atlantics/optimizer.py @@ -0,0 +1,111 @@ +from argparse import ArgumentParser +import numpy as np +from snc import SnCalc, assign_params_default +from scipy.interpolate import interp1d +from scipy.optimize import minimize, brentq +from scipy.integrate import simps +import matplotlib.pyplot as plt + + +parser = ArgumentParser() +parser.add_argument("--nz-file", default="nzs.npy", type=str, + help="Path to input file containing the N(z) of all initial groups") +parser.add_argument("--prefix-out", default='out', type=str, + help="Prefix for all output files") +parser.add_argument("--n-bins", default=4, type=int, + help="Number of bins (excluding a possible trash bin") +parser.add_argument("--prob-in-bin", default=-1., type=float, + help="Minimum fraction of the N(z) that should be inside the bin." + "If <= 0, this will not be taken into account in the optimization.") +parser.add_argument("--prob-out-bin", default=-1., type=float, + help="Maximum fraction of the N(z) that should be inside other bins." + "If <= 0, this will not be taken into account in the optimization.") +o = parser.parse_args() + +nzs = np.load(o.nz_file) +# We should probably read z on input too +z_arr = np.linspace(0, 2, 124) +zz = (z_arr[1:]+z_arr[:-1])*0.5 +# Renormalize to use the same sample definitions as the +# official metric calculators. +fsky = 0.25 +ndens_arcmin = 25. 
+area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky +ng_tot = ndens_arcmin * area_arcmin +ng_each_small = simps(nzs, x=zz) +ng_tot_small = np.sum(ng_each_small) +ng_each = ng_each_small * ng_tot / ng_tot_small +nzs = nzs * ng_tot / ng_tot_small + +# Initialize calculator +sc = SnCalc(zz, nzs, fsky=fsky) +sc.get_cl_matrix(fname_save=o.prefix_out + 'cl_wl.npz') + +# Initial edge guess (defined as having equal number of galaxies) +nz_tot = np.sum(nzs, axis=0) +cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) +cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + fill_value=(0, 1)) +edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) + for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) + + +# Minimize +# Binning parameters +params = assign_params_default.copy() +if o.prob_in_bin > 0: + params['p_inbin_thr'] = o.prob_in_bin + params['use_p_inbin'] = True +if o.prob_out_bin > 0: + params['p_outbin_thr'] = o.prob_out_bin + params['use_p_outbin'] = True + + +def minus_sn(edges, calc): + return -calc.get_sn_from_edges(edges, assign_params=params) + + +# Actual optimization +res = minimize(minus_sn, edges_0, method='Powell', args=(sc,)) +edges_1 = res.x + +# Post-process: +# S/N +sn = sc.get_sn_from_edges(edges_1, assign_params=params) +# N(z)s of the final bins +nz_best = sc.get_nzs_from_edges(edges_1, assign_params=params) +# Group assignment +assign = sc.assign_from_edges(edges_1, assign_params=params, + get_ids=True) + + +def get_bin_name(i): + if i == -1: + return 'bin_trash' + else: + return 'bin_%d' % i + +print("Redshift edges: ", edges_1) +print("Optimal S/N: ", sn) +n_tot = simps(np.sum(nz_best, axis=0), + x=zz) + +d_out = {'sn': sn, + 'edges': edges_1} +plt.figure() +for (i, a), n in zip(assign, nz_best): + name = get_bin_name(i) + d_out[name+'_groups'] = a + d_out[name+'_nz'] = n + print("Bin %d, groups: " % i, a) + n_here = simps(n, x=zz) + f = n_here / n_tot + frac = "%.1lf" % (100 * f) + plt.plot(zz, n/n_here, + label=f'Bin {i} ({frac} %)') +plt.legend() +plt.xlabel(r'$z$', fontsize=15) +plt.ylabel(r'$p(z)$', fontsize=15) +plt.show() + +np.savez(o.prefix_out + '_bin_info.npz', **d_out) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 0574acee..f009115f 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -17,7 +17,7 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, s_gamma=0.28, use_clustering=False): """ S/N calculator Args: - z_arr (array_like): array of redshifts at which all the N(z)s are + z_arr (array_like): array of redshifts at which all the N(z)s are sampled. They should be linearly spaced. nz_list (list): list of arrays containing the redshift distributions of all the initial groups (each array in the @@ -43,10 +43,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.z_arr = z_arr self.nz_list = nz_list self.n_gals = np.array([simps(nz, x=self.z_arr) - for nz in self.nz_list]) + for nz in self.nz_list]) self.pz_list = self.nz_list / self.n_gals[:, None] self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) - for nz in self.nz_list]) / self.n_gals + for nz in self.nz_list]) / self.n_gals self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering @@ -102,7 +102,7 @@ def get_resample_metadata(self, assign): """ Transform an assignment list into a weight matrix and the corresponding number densities. Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. 
Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. @@ -192,14 +192,24 @@ def check_edges(self, edges): np.any(edges > self.edges_large) or \ np.any(np.diff(edges) < 0) - def assign_from_edges(self, edges, assign_params=assign_params_default, return_bins=False): + def get_nzs_from_edges(self, edges, assign_params=assign_params_default): + assign = self.assign_from_edges(edges, assign_params=assign_params) + nzs = np.array([np.sum(self.nz_list[a, :], axis=0) + for a in assign]) + return nzs + + def assign_from_edges(self, edges, assign_params=assign_params_default, + get_ids=False): """ Get assignment list from edges and assign parameters. Args: - edges (array_like): array of bin edges. + edges (array_like): array of bin edges. assign_params (dict): dictionary of assignment parameters (see `assign_params_default`). + get_ids (bool): if `True`, output assignment arrays will + be accompanied by the associated bin id (with -1 for the + trash bin). Returns: - List of assignment arrays ready to be used in e.g. `get_sn`. + List of assignment arrays ready to be used in e.g. `get_sn`. """ nbins = len(edges) + 1 # Bin IDs based on mean z @@ -211,17 +221,20 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, return_b if assign_params['use_p_outbin']: # Matrix of probabilities in each bin pzs = np.array([[simps(pz[m], x=self.z_arr[m]) - for m in masks] - for pz in self.pz_list]) + for m in masks] + for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] - for j in range(self.n_samples)]) + for j in range(self.n_samples)]) else: # Overlaps in own bin - pzd = np.array([simps(pz[masks[i]], - x=self.z_arr[masks[i]]) - for pz, i in zip(self.pz_list, ids)]) - - # Throw away based on in-bin probability + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) + pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) + for pz, i in zip(self.pz_list, ids)]) + # Throw away based on in-bin probability if assign_params['use_p_inbin']: ids[pzd < assign_params['p_inbin_thr']] = -1 # Throw away based on off-bin probability @@ -229,13 +242,13 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, return_b ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 for p in pzs])] = -1 # Assignment list - if return_bins: - return [np.where(ids == i)[0] for i in np.unique(ids)], np.unique(ids) + if get_ids: + return [(i, np.where(ids == i)[0]) for i in np.unique(ids)] else: return [np.where(ids == i)[0] for i in np.unique(ids)] def get_sn_from_edges(self, edges, full_output=False, - assign_params=assign_params_default): + assign_params=assign_params_default): """ Compute signal-to-noise ratio from a set of edges and assignment parameters. 
Args: diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 37211d73..5c0fbb42 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -293,12 +293,11 @@ def minus_sn(edges, calc): print(" ") print(res) - groups_in_tomo, tomo_bins = c_wl.assign_from_edges(res.x, return_bins=True) + groups_in_tomo = c_wl.assign_from_edges(res.x, get_ids=True) group_bins = np.zeros(num_groups) - for i, bin_no in enumerate(tomo_bins): - print(i) + for bin_no, groups in enumerate(groups_in_tomo): print(bin_no) - group_bins[groups_in_tomo[i]] = bin_no + group_bins[groups] = bin_no print(group_bins) #Construct the per-group Nz diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 0574acee..f009115f 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -17,7 +17,7 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, s_gamma=0.28, use_clustering=False): """ S/N calculator Args: - z_arr (array_like): array of redshifts at which all the N(z)s are + z_arr (array_like): array of redshifts at which all the N(z)s are sampled. They should be linearly spaced. nz_list (list): list of arrays containing the redshift distributions of all the initial groups (each array in the @@ -43,10 +43,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.z_arr = z_arr self.nz_list = nz_list self.n_gals = np.array([simps(nz, x=self.z_arr) - for nz in self.nz_list]) + for nz in self.nz_list]) self.pz_list = self.nz_list / self.n_gals[:, None] self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) - for nz in self.nz_list]) / self.n_gals + for nz in self.nz_list]) / self.n_gals self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering @@ -102,7 +102,7 @@ def get_resample_metadata(self, assign): """ Transform an assignment list into a weight matrix and the corresponding number densities. Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. @@ -192,14 +192,24 @@ def check_edges(self, edges): np.any(edges > self.edges_large) or \ np.any(np.diff(edges) < 0) - def assign_from_edges(self, edges, assign_params=assign_params_default, return_bins=False): + def get_nzs_from_edges(self, edges, assign_params=assign_params_default): + assign = self.assign_from_edges(edges, assign_params=assign_params) + nzs = np.array([np.sum(self.nz_list[a, :], axis=0) + for a in assign]) + return nzs + + def assign_from_edges(self, edges, assign_params=assign_params_default, + get_ids=False): """ Get assignment list from edges and assign parameters. Args: - edges (array_like): array of bin edges. + edges (array_like): array of bin edges. assign_params (dict): dictionary of assignment parameters (see `assign_params_default`). + get_ids (bool): if `True`, output assignment arrays will + be accompanied by the associated bin id (with -1 for the + trash bin). Returns: - List of assignment arrays ready to be used in e.g. `get_sn`. + List of assignment arrays ready to be used in e.g. `get_sn`. 
""" nbins = len(edges) + 1 # Bin IDs based on mean z @@ -211,17 +221,20 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, return_b if assign_params['use_p_outbin']: # Matrix of probabilities in each bin pzs = np.array([[simps(pz[m], x=self.z_arr[m]) - for m in masks] - for pz in self.pz_list]) + for m in masks] + for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] - for j in range(self.n_samples)]) + for j in range(self.n_samples)]) else: # Overlaps in own bin - pzd = np.array([simps(pz[masks[i]], - x=self.z_arr[masks[i]]) - for pz, i in zip(self.pz_list, ids)]) - - # Throw away based on in-bin probability + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) + pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) + for pz, i in zip(self.pz_list, ids)]) + # Throw away based on in-bin probability if assign_params['use_p_inbin']: ids[pzd < assign_params['p_inbin_thr']] = -1 # Throw away based on off-bin probability @@ -229,13 +242,13 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, return_b ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 for p in pzs])] = -1 # Assignment list - if return_bins: - return [np.where(ids == i)[0] for i in np.unique(ids)], np.unique(ids) + if get_ids: + return [(i, np.where(ids == i)[0]) for i in np.unique(ids)] else: return [np.where(ids == i)[0] for i in np.unique(ids)] def get_sn_from_edges(self, edges, full_output=False, - assign_params=assign_params_default): + assign_params=assign_params_default): """ Compute signal-to-noise ratio from a set of edges and assignment parameters. Args: From 037a2b75419e95f897526a9b819366b3c0111982 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Sun, 30 Aug 2020 19:20:58 +0100 Subject: [PATCH 09/35] 20 --- the_atlantics/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py index 9369a28e..7fce42c4 100644 --- a/the_atlantics/optimizer.py +++ b/the_atlantics/optimizer.py @@ -29,7 +29,7 @@ # Renormalize to use the same sample definitions as the # official metric calculators. fsky = 0.25 -ndens_arcmin = 25. +ndens_arcmin = 20. area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky ng_tot = ndens_arcmin * area_arcmin ng_each_small = simps(nzs, x=zz) From b159dca35172ec9f8f8b34297f3c0553eae127cf Mon Sep 17 00:00:00 2001 From: David Alonso Date: Sun, 30 Aug 2020 19:23:26 +0100 Subject: [PATCH 10/35] 0.28 --- the_atlantics/snc.py | 2 +- tomo_challenge/snc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index f009115f..1aca43c0 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -14,7 +14,7 @@ class SnCalc(object): edges_large = 100. def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.28, use_clustering=False): + s_gamma=0.26, use_clustering=False): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index f009115f..1aca43c0 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -14,7 +14,7 @@ class SnCalc(object): edges_large = 100. 
def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.28, use_clustering=False): + s_gamma=0.26, use_clustering=False): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are From f313e58c86c204245bbf04766c20fb77627b5b9d Mon Sep 17 00:00:00 2001 From: Andrina Nicola Date: Sun, 30 Aug 2020 12:21:45 -0700 Subject: [PATCH 11/35] Adapted fsky. --- tomo_challenge/classifiers/ComplexSOM.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 5c0fbb42..f0779992 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -269,10 +269,16 @@ def train (self, training_data, training_z): # np.save('nzs.npy', nzs) - #Update the fsky - n_gal = np.sum(np.array(tab)) # Total number of galaxies you have, change appropriately - n_eff = 20. # Target density in arcmin^-2 - fsky = n_gal / (4*np.pi*(180 * 60 / np.pi)**2 * n_eff) + #Update the fsky + # Renormalize to use the same sample definitions as the + # official metric calculators. + fsky = 0.25 + ndens_arcmin = 20. + area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky + ng_tot_goal = ndens_arcmin * area_arcmin + ng_tot_curr = np.sum(np.array(tab)) + nzs = [nz * ng_tot_goal / ng_tot_curr for nz in nzs] + # Now initialize a S/N calculator for these initial groups. os.system('rm wl_nb%d.npz' % n_bin) if metric == 'SNR_ww': @@ -295,10 +301,9 @@ def minus_sn(edges, calc): groups_in_tomo = c_wl.assign_from_edges(res.x, get_ids=True) group_bins = np.zeros(num_groups) - for bin_no, groups in enumerate(groups_in_tomo): - print(bin_no) + for bin_no, groups in groups_in_tomo: + print(bin_no, groups) group_bins[groups] = bin_no - print(group_bins) #Construct the per-group Nz print("Outputting trained SOM and redshift ordering of SOM groups") From 2fa9e3f9a06b9ec86cc28e2fb4c0b8655e8fce1f Mon Sep 17 00:00:00 2001 From: David Alonso Date: Mon, 31 Aug 2020 11:12:01 +0100 Subject: [PATCH 12/35] Max edge at z=3 --- the_atlantics/snc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 1aca43c0..9d262eb9 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -11,7 +11,7 @@ class SnCalc(object): - edges_large = 100. + edges_large = 3. def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, s_gamma=0.26, use_clustering=False): From 6cd2cea50cc163b05ae7c10a6e698f1256ab7ef1 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Mon, 31 Aug 2020 11:12:49 +0100 Subject: [PATCH 13/35] max edge at z=3 --- tomo_challenge/snc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 1aca43c0..9d262eb9 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -11,7 +11,7 @@ class SnCalc(object): - edges_large = 100. + edges_large = 3. 
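+    # Any edge above this value makes check_edges() flag the edge set, and
+    # get_sn_from_edges() then returns S/N = 0, confining the optimizer
+    # to z <= 3.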
def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, s_gamma=0.26, use_clustering=False): From 69cd49bb5019776555aaba3e0fcf9b8efc0985f9 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Mon, 31 Aug 2020 13:34:05 +0200 Subject: [PATCH 14/35] added .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..bc56b5a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +plots/* + From b9decf24a4040b55828516efcc3b7a0fd754d26d Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Mon, 31 Aug 2020 13:35:47 +0200 Subject: [PATCH 15/35] updated ComplexSOM code & yaml, improved grouping --- example/ComplexSOM.yaml | 6 +-- tomo_challenge/classifiers/ComplexSOM.py | 50 ++++++++++++++++-------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 63a97e3e..e19130f4 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -2,7 +2,7 @@ metrics: [SNR_3x2] bands: riz training_file: data/training.hdf5 validation_file: data/validation_small.hdf5 -output_file: example/example_output.txt +output_file: example/ComplexSOM_output.txt # Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" metrics_impl: firecrown @@ -12,12 +12,12 @@ run: run_3: # These settings are sent to the classifier bins: 5 - som_dim: [21,21] + som_dim: [51,51] num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.05 + sparse_frac: 0.15 plots: True plot_dir: 'plots' metric: "SNR_3x2" diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index f0779992..ea37a806 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -120,9 +120,9 @@ def train (self, training_data, training_z): group_type = 'redshift' #Define the redshift summary statistics (used for making groups in the 'redshift' case - property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","N","iqr_z_true") + property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true") property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", - "mad(data$redshift_true)","nrow(data)", + "mad(data$redshift_true)", "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))") #Define the SOM variables if self.bands == 'riz': @@ -203,7 +203,10 @@ def train (self, training_data, training_z): expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) print("Constructing redshift-based hierarchical cluster tree") #Cluster the SOM cells into num_groups groups - hclust=stats.hclust(stats.dist(cell_prop.rx2['property'])) + props = cell_prop.rx2['property'] + props.rx[base.which(base.is_na(props))] = -1 + print(base.summary(props)) + hclust=stats.hclust(stats.dist(props)) cell_group=stats.cutree(hclust,k=num_groups) #Assign the cell groups to the SOM structure @@ -222,6 +225,7 @@ def train (self, training_data, training_z): if plots == True: #Make the diagnostic plots props = group_prop.rx2('property') + cprops = cell_prop.rx2('property') print(base.colnames(props)) print("Constructing SOM diagnostic figures") #Initialise the plot device @@ -240,6 +244,15 @@ def train (self, training_data, training_z): #Paint by the redshift diagnostics: #mean redshift + gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, + 
type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell mean redshift',zlim=FloatVector([0,1.8])) + #redshift std + gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell redshift stdev',zlim=FloatVector([0,0.2])) + #2sigma redshift IQR + gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell 2sigma IQR',zlim=FloatVector([0,0.4])) + #mean redshift gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group mean redshift',zlim=FloatVector([0,1.8])) #redshift std @@ -247,10 +260,10 @@ def train (self, training_data, training_z): type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group redshift stdev',zlim=FloatVector([0,0.2])) #2sigma redshift IQR gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, - type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='group 2sigma IQR',zlim=FloatVector([0,0.4])) - #N group - gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, - type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group 2sigma IQR',zlim=FloatVector([0,0.4])) + ##N group + #gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, + # type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') #Close the plot device dev.dev_off() @@ -267,7 +280,7 @@ def train (self, training_data, training_z): #Construct the per-group Nz nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] - # np.save('nzs.npy', nzs) + #np.save('plots/nzs.npy', nzs) #Update the fsky # Renormalize to use the same sample definitions as the @@ -278,6 +291,10 @@ def train (self, training_data, training_z): ng_tot_goal = ndens_arcmin * area_arcmin ng_tot_curr = np.sum(np.array(tab)) nzs = [nz * ng_tot_goal / ng_tot_curr for nz in nzs] + + #np.save('plots/nzs_norm.npy', nzs) + + #print(nzs) # Now initialize a S/N calculator for these initial groups. 
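         # Clear any cached C_ell matrix first; get_cl_matrix() would
         # otherwise re-load a stale file from a previous run with a
         # different set of groups.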
os.system('rm wl_nb%d.npz' % n_bin) @@ -299,15 +316,14 @@ def minus_sn(edges, calc): print(" ") print(res) + #Construct the per-group Nz + print("Outputting trained SOM and redshift ordering of SOM groups") groups_in_tomo = c_wl.assign_from_edges(res.x, get_ids=True) group_bins = np.zeros(num_groups) for bin_no, groups in groups_in_tomo: print(bin_no, groups) group_bins[groups] = bin_no - #Construct the per-group Nz - print("Outputting trained SOM and redshift ordering of SOM groups") - print("Finished") self.train_som = train_som self.group_bins = group_bins @@ -329,6 +345,8 @@ def apply (self, data): #Number of tomographic bins n_bin = self.opt['bins'] + #Number of discrete SOM groups + num_groups = self.opt['num_groups'] print("Preparing the data") data = pd.DataFrame.from_dict(data) @@ -336,24 +354,22 @@ def apply (self, data): #Construct the validation data frame (just a python-to-R data conversion) print("Converting the data to R format") with localconverter(ro.default_converter + pandas2ri.converter): - #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) data_df = ro.conversion.py2rpy(data) print("Parsing the validation data into the SOM groupings") #Generate the validation associations/groups group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, expression="nrow(data)",expr_label="N", - n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) + n_cluster_bins=num_groups,n_cores=self.opt['num_threads']) #extract the validation som (just for convenience) valid_som = group_prop.rx2('som') #Assign the sources, by group, to tomographic bins print("Output source tomographic bin assignments") - valid_bin = base.unsplit(self.group_bins,FactorVector(valid_som.rx2['clust.classif'], - levels=base.seq(self.opt['num_groups'])),drop=False) - #valid_bin = valid_som.rx2['clust.classif'] - valid_bin = np.array(valid_bin)-1 + valid_bin = base.unsplit(IntVector(self.group_bins),FactorVector(valid_som.rx2['clust.classif'], + levels=base.seq(num_groups)),drop=False) + valid_bin = np.array(valid_bin) return valid_bin From 10eef4688d4a7eecb79fa6a75f0f916c2148b980 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Mon, 31 Aug 2020 21:53:38 +0100 Subject: [PATCH 16/35] spline integrator --- the_atlantics/optimizer.py | 10 ++++++++-- the_atlantics/snc.py | 8 ++++++-- tomo_challenge/snc.py | 8 ++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py index 7fce42c4..d64bedc4 100644 --- a/the_atlantics/optimizer.py +++ b/the_atlantics/optimizer.py @@ -101,11 +101,17 @@ def get_bin_name(i): n_here = simps(n, x=zz) f = n_here / n_tot frac = "%.1lf" % (100 * f) + if i == -1: + bin_name = 'Trash bin' + else: + bin_name = f'Bin {i}' plt.plot(zz, n/n_here, - label=f'Bin {i} ({frac} %)') + label=bin_name + f' ({frac} %)') plt.legend() plt.xlabel(r'$z$', fontsize=15) plt.ylabel(r'$p(z)$', fontsize=15) -plt.show() +plt.savefig(o.prefix_out + '_nz_summary.png', + bbox_inches='tight') np.savez(o.prefix_out + '_bin_info.npz', **d_out) +plt.show() diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 1aca43c0..0710ab3f 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -14,7 +14,7 @@ class SnCalc(object): edges_large = 100. 
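     # Note: CCL's 'spline' Limber integration is typically much faster than
     # the default 'qag_quad' adaptive quadrature, at a modest accuracy cost;
     # presumably the motivation for making it the default here.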
def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.26, use_clustering=False): + s_gamma=0.26, use_clustering=False, integrator='spline'): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are @@ -32,7 +32,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, `use_clustering=False`). use_clustering (bool): if `True`, SNR will be computed for clustering instead of lensing. + integrator (string): CCL integration method. Either 'qag_quad' + or 'spline'. """ + self.integrator = integrator self.s_gamma = s_gamma self.fsky = fsky self.lmax = lmax @@ -90,7 +93,8 @@ def get_cl_matrix(self, fname_save=None, recompute=False): self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): - cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr) + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) self.cls[:, i, j] = cl if j != i: self.cls[:, j, i] = cl diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 1aca43c0..0710ab3f 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -14,7 +14,7 @@ class SnCalc(object): edges_large = 100. def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.26, use_clustering=False): + s_gamma=0.26, use_clustering=False, integrator='spline'): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are @@ -32,7 +32,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, `use_clustering=False`). use_clustering (bool): if `True`, SNR will be computed for clustering instead of lensing. + integrator (string): CCL integration method. Either 'qag_quad' + or 'spline'. """ + self.integrator = integrator self.s_gamma = s_gamma self.fsky = fsky self.lmax = lmax @@ -90,7 +93,8 @@ def get_cl_matrix(self, fname_save=None, recompute=False): self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): - cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr) + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) self.cls[:, i, j] = cl if j != i: self.cls[:, j, i] = cl From a6045b788af1a833a9e678936b8297b92ac121e9 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Mon, 31 Aug 2020 22:12:29 +0100 Subject: [PATCH 17/35] flake8 --- the_atlantics/snc.py | 40 ++++++++++++++++++++-------------------- tomo_challenge/snc.py | 40 ++++++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 8ef2d31d..dc05921d 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -5,9 +5,9 @@ assign_params_default = {'p_inbin_thr': 0.5, - 'p_outbin_thr': 0.2, - 'use_p_inbin': False, - 'use_p_outbin': False} + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} class SnCalc(object): @@ -60,7 +60,7 @@ def _bz_model(self, cosmo, z): def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. Args: - fname_save (string): if not None, the result will be saved to a + fname_save (string): if not None, the result will be saved to a file by this name, and if present and `recompute=False`, the matrix will be read from that file. 
recompute (bool): if `True`, any previous calculations of the @@ -74,22 +74,22 @@ def get_cl_matrix(self, fname_save=None, recompute=False): # Cosmology cosmo = ccl.Cosmology(Omega_c=0.25, - Omega_b=0.05, - h=0.67, n_s=0.96, - sigma8=0.81) + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) # Tracers if self.use_clustering: trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] else: trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + for nz in self.nz_list] - # Cls + # Cls self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): @@ -161,10 +161,10 @@ def get_sn_wn(self, weights, n_dens, full_output=False): icl = np.linalg.inv(cl) sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], - axis=2) + axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + self.d_ell / self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -175,26 +175,26 @@ def get_sn_wn(self, weights, n_dens, full_output=False): def get_sn(self, assign, full_output=False): """ Compute signal-to-noise ratio from a bin assignment list. Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ w, ndens = self.get_resample_metadata(assign) return self.get_sn_wn(w, ndens, - full_output=full_output) + full_output=full_output) def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. """ return np.any(edges < 0) or \ - np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) def get_nzs_from_edges(self, edges, assign_params=assign_params_default): assign = self.assign_from_edges(edges, assign_params=assign_params) diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 8ef2d31d..dc05921d 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -5,9 +5,9 @@ assign_params_default = {'p_inbin_thr': 0.5, - 'p_outbin_thr': 0.2, - 'use_p_inbin': False, - 'use_p_outbin': False} + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} class SnCalc(object): @@ -60,7 +60,7 @@ def _bz_model(self, cosmo, z): def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. Args: - fname_save (string): if not None, the result will be saved to a + fname_save (string): if not None, the result will be saved to a file by this name, and if present and `recompute=False`, the matrix will be read from that file. 
recompute (bool): if `True`, any previous calculations of the @@ -74,22 +74,22 @@ def get_cl_matrix(self, fname_save=None, recompute=False): # Cosmology cosmo = ccl.Cosmology(Omega_c=0.25, - Omega_b=0.05, - h=0.67, n_s=0.96, - sigma8=0.81) + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) # Tracers if self.use_clustering: trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] else: trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + for nz in self.nz_list] - # Cls + # Cls self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): @@ -161,10 +161,10 @@ def get_sn_wn(self, weights, n_dens, full_output=False): icl = np.linalg.inv(cl) sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], - axis=2) + axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + self.d_ell / self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -175,26 +175,26 @@ def get_sn_wn(self, weights, n_dens, full_output=False): def get_sn(self, assign, full_output=False): """ Compute signal-to-noise ratio from a bin assignment list. Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ w, ndens = self.get_resample_metadata(assign) return self.get_sn_wn(w, ndens, - full_output=full_output) + full_output=full_output) def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. 
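        (i.e. any negative edge, any edge above `edges_large`, or any
        pair of consecutive edges in decreasing order).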
""" return np.any(edges < 0) or \ - np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) def get_nzs_from_edges(self, edges, assign_params=assign_params_default): assign = self.assign_from_edges(edges, assign_params=assign_params) From 42913cc3d6ed8c75823a7ea186021aec6e9083a5 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 11:41:50 +0200 Subject: [PATCH 18/35] updated gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bc56b5a7..2b112cc2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ +data/* plots/* - +wl_*.npz +*.pkl From dc0512853dd1db13fba319be648c7793e15f0978 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 11:42:20 +0200 Subject: [PATCH 19/35] updated gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2b112cc2..94242df7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +data data/* plots/* wl_*.npz From 453e89927a3cf873a4bb7a3d4e03bb901c0de158 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 11:48:33 +0200 Subject: [PATCH 20/35] added save of SOM --- example/ComplexSOM.yaml | 12 ++++--- tomo_challenge/classifiers/ComplexSOM.py | 41 +++++++++++++++++++----- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index e19130f4..4e386b5f 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -1,8 +1,8 @@ metrics: [SNR_3x2] bands: riz training_file: data/training.hdf5 -validation_file: data/validation_small.hdf5 -output_file: example/ComplexSOM_output.txt +validation_file: data/validation.hdf5 +output_file: example/ComplexSOM_output_manygroup.txt # Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" metrics_impl: firecrown @@ -11,16 +11,18 @@ run: ComplexSOM: run_3: # These settings are sent to the classifier - bins: 5 - som_dim: [51,51] + bins: 7 + som_dim: [21,21] num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.15 + sparse_frac: 0.05 plots: True plot_dir: 'plots' metric: "SNR_3x2" + use_inbin: True + use_outbin: True # These special settings decide whether the # color and error colums are passed to the classifier # as well as the magnitudes diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index ea37a806..18704aa1 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -25,6 +25,7 @@ from ..snc import SnCalc from scipy.optimize import minimize from scipy.integrate import simps +import pickle #Check that all the needed packages are installed # R package nameo @@ -50,7 +51,8 @@ class ComplexSOM(Tomographer): # valid parameter -- see below valid_options = ['bins','som_dim','num_groups','num_threads', - 'group_type','data_threshold','sparse_frac','plots','plot_dir','metric'] + 'group_type','data_threshold','sparse_frac','plots', + 'plot_dir','metric','use_inbin','use_outbin'] # this settings means arrays will be sent to train and apply instead # of dictionaries wants_arrays = False @@ -109,6 +111,10 @@ def train (self, training_data, training_z): data_threshold = self.opt['data_threshold'] #Metric metric = self.opt['metric'] + #Flag bad bins with inbin probability + use_inbin = self.opt['use_inbin'] + #Flag bad bins with outbin probability + use_outbin = self.opt['use_outbin'] 
#Plots plots = self.opt['plots'] #Plot Output Directory @@ -118,6 +124,11 @@ def train (self, training_data, training_z): #Check that the group type is a valid choice if group_type != 'colour' and group_type != 'redshift': group_type = 'redshift' + #Define the assign_params variable, used in bin optimisation + assign_params = {'p_inbin_thr': 0.5, + 'p_outbin_thr': 0.2, + 'use_p_inbin': use_inbin, + 'use_p_outbin': use_outbin} #Define the redshift summary statistics (used for making groups in the 'redshift' case property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true") @@ -190,10 +201,22 @@ def train (self, training_data, training_z): #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) train_df = ro.conversion.py2rpy(training_data) - print("Training the SOM using R kohtrain") - #Train the SOM using R kohtrain - som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), - n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + #Construct or Load the SOM + som_outname = f"SOM_{som_dim}_{self.bands}.pkl" + if not os.path.exists(som_outname): + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + #Output the SOM + #base.save(som,file=som_outname) + with open(som_outname, 'wb') as f: + pickle.dump(som, f) + else: + print("Loading the pretrained SOM") + with open(som_outname, 'rb') as f: + som = pickle.load(f) + som.rx2['unit.classif']=FloatVector([]) #If grouping by redshift, construct the cell redshift statistics if group_type == 'redshift' or plots == True: @@ -203,7 +226,9 @@ def train (self, training_data, training_z): expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) print("Constructing redshift-based hierarchical cluster tree") #Cluster the SOM cells into num_groups groups - props = cell_prop.rx2['property'] + props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']), + data_missing='NA',data_threshold=FloatVector([0,12])) + props = props.rx2("data.white") props.rx[base.which(base.is_na(props))] = -1 print(base.summary(props)) hclust=stats.hclust(stats.dist(props)) @@ -306,13 +331,13 @@ def train (self, training_data, training_z): c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin) # Finally, let's write the function to minimize and optimize for a 4-bin case. 
def minus_sn(edges, calc): - return -calc.get_sn_from_edges(edges) + return -calc.get_sn_from_edges(edges,assign_params=assign_params) print("Optimizing WL") edges_0 = np.linspace(0, 2, n_bin-1) res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) print("WL final edges: ", res.x) - print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)*np.sqrt(0.25/fsky)) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)*np.sqrt(0.25/fsky),assign_params=assign_params) print(" ") print(res) From 26f138592493ba75507403a495bb1c867bfde7f9 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 11:49:04 +0200 Subject: [PATCH 21/35] changed settings --- example/ComplexSOM.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 4e386b5f..56f20ff9 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -12,12 +12,12 @@ run: run_3: # These settings are sent to the classifier bins: 7 - som_dim: [21,21] + som_dim: [51,51] num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.05 + sparse_frac: 0.25 plots: True plot_dir: 'plots' metric: "SNR_3x2" From a2d7355ee6d932ca2e06e2b2b399d0cfdcb95262 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 15:10:16 +0200 Subject: [PATCH 22/35] fix bug in SNR calculation call --- tomo_challenge/classifiers/ComplexSOM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 18704aa1..d83d3d7f 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -337,7 +337,7 @@ def minus_sn(edges, calc): edges_0 = np.linspace(0, 2, n_bin-1) res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) print("WL final edges: ", res.x) - print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)*np.sqrt(0.25/fsky),assign_params=assign_params) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x,assign_params=assign_params)*np.sqrt(0.25/fsky)) print(" ") print(res) From 7fae683fc9c876a09777925eb00b3d98a8cad969 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 15:47:05 +0200 Subject: [PATCH 23/35] speed up when using pretrained SOM --- tomo_challenge/classifiers/ComplexSOM.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index d83d3d7f..a3064b07 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -224,6 +224,8 @@ def train (self, training_data, training_z): #Construct the Nz properties per SOM cell cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + som=cell_prop.rx2('som') + som.rx2['clust.classif']=FloatVector([]) print("Constructing redshift-based hierarchical cluster tree") #Cluster the SOM cells into num_groups groups props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']), From 83d1bf650a72ee4f88053b99e7977758e05398ed Mon Sep 17 00:00:00 2001 From: David Alonso Date: Tue, 1 Sep 2020 15:04:56 +0100 Subject: [PATCH 24/35] safe integration --- the_atlantics/snc.py | 12 ++++++------ tomo_challenge/snc.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index dc05921d..db83e1c8 100644 --- a/the_atlantics/snc.py +++ 
b/the_atlantics/snc.py @@ -219,23 +219,23 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, # Bin IDs based on mean z ids = np.digitize(self.z_means, bins=edges) if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) # Calculate probabilities in each bin zgroups = np.digitize(self.z_arr, bins=edges) masks = [zgroups == i for i in range(nbins)] if assign_params['use_p_outbin']: # Matrix of probabilities in each bin - pzs = np.array([[simps(pz[m], x=self.z_arr[m]) + pzs = np.array([[integrate_safe(pz, self.z_arr, m) for m in masks] for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] for j in range(self.n_samples)]) else: # Overlaps in own bin - def integrate_safe(p, z, m): - if np.sum(m) == 0: - return 0 - else: - return simps(p[m], x=z[m]) pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) for pz, i in zip(self.pz_list, ids)]) # Throw away based on in-bin probability diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index dc05921d..db83e1c8 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -219,23 +219,23 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, # Bin IDs based on mean z ids = np.digitize(self.z_means, bins=edges) if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) # Calculate probabilities in each bin zgroups = np.digitize(self.z_arr, bins=edges) masks = [zgroups == i for i in range(nbins)] if assign_params['use_p_outbin']: # Matrix of probabilities in each bin - pzs = np.array([[simps(pz[m], x=self.z_arr[m]) + pzs = np.array([[integrate_safe(pz, self.z_arr, m) for m in masks] for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] for j in range(self.n_samples)]) else: # Overlaps in own bin - def integrate_safe(p, z, m): - if np.sum(m) == 0: - return 0 - else: - return simps(p[m], x=z[m]) pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) for pz, i in zip(self.pz_list, ids)]) # Throw away based on in-bin probability From 0bb9f7999883cc660527b26d0c2ffa8201efcb95 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Fri, 4 Sep 2020 18:06:03 +0200 Subject: [PATCH 25/35] Merge branch 'the_atlantics' of github.com:damonge/tomo_challenge into the_atlantics --- example/ComplexSOM.yaml | 14 +++++++------- tomo_challenge/classifiers/ComplexSOM.py | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 56f20ff9..5670a39c 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -1,8 +1,8 @@ metrics: [SNR_3x2] bands: riz -training_file: data/training.hdf5 -validation_file: data/validation.hdf5 -output_file: example/ComplexSOM_output_manygroup.txt +training_file: data_buzzard/training.hdf5 +validation_file: data_buzzard/validation.hdf5 +output_file: example/ComplexSOM_output.txt # Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" metrics_impl: firecrown @@ -11,18 +11,18 @@ run: ComplexSOM: run_3: # These settings are sent to the classifier - bins: 7 + bins: 5 som_dim: [51,51] num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.25 + sparse_frac: 0.5 plots: True plot_dir: 'plots' metric: "SNR_3x2" - use_inbin: True - use_outbin: True + use_inbin: False + use_outbin: False # These special settings decide whether the # color 
and error colums are passed to the classifier # as well as the magnitudes diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index a3064b07..d82fb034 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -223,7 +223,7 @@ def train (self, training_data, training_z): print("Constructing cell-based redshift properties") #Construct the Nz properties per SOM cell cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, - expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + expression=StrVector(property_expressions),expr_label=StrVector(property_labels),returnMatrix=True) som=cell_prop.rx2('som') som.rx2['clust.classif']=FloatVector([]) print("Constructing redshift-based hierarchical cluster tree") @@ -244,7 +244,7 @@ def train (self, training_data, training_z): print("Constructing group-based redshift properties") group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, expression=StrVector(property_expressions),expr_label=StrVector(property_labels), - n_cluster_bins=num_groups) + n_cluster_bins=num_groups,returnMatrix=True) #extract the training som (just for convenience) train_som = group_prop.rx2('som') @@ -387,7 +387,7 @@ def apply (self, data): #Generate the validation associations/groups group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, expression="nrow(data)",expr_label="N", - n_cluster_bins=num_groups,n_cores=self.opt['num_threads']) + n_cluster_bins=num_groups,n_cores=self.opt['num_threads'],returnMatrix=True) #extract the validation som (just for convenience) valid_som = group_prop.rx2('som') From 038dcf669f10cbbed05545bac0195b2ff4ad334b Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 8 Sep 2020 15:07:11 +0200 Subject: [PATCH 26/35] added testing mode --- bin/challenge.py | 20 ++- example/ComplexSOM.yaml | 29 +++- tomo_challenge/classifiers/ComplexSOM.py | 172 ++++++++++++++--------- tomo_challenge/data.py | 11 +- 4 files changed, 151 insertions(+), 81 deletions(-) diff --git a/bin/challenge.py b/bin/challenge.py index 5ec60273..dcabe9e4 100755 --- a/bin/challenge.py +++ b/bin/challenge.py @@ -27,12 +27,15 @@ def main(config_yaml): # Decide if anyone needs the colors calculating and/or errors loading anyone_wants_colors = False anyone_wants_errors = False + anyone_wants_sizes = False for run in config['run'].values(): for version in run.values(): if version.get('errors'): anyone_wants_errors = True if version.get('colors'): anyone_wants_colors = True + if version.get('sizes'): + anyone_wants_sizes = True bands = config['bands'] @@ -41,14 +44,16 @@ def main(config_yaml): config['training_file'], bands, errors=anyone_wants_errors, - colors=anyone_wants_colors + colors=anyone_wants_colors, + size=anyone_wants_sizes ) validation_data = tc.load_data( config['validation_file'], bands, errors=anyone_wants_errors, - colors=anyone_wants_colors + colors=anyone_wants_colors, + size=anyone_wants_sizes ) training_z = tc.load_redshift(config['training_file']) @@ -59,7 +64,7 @@ def main(config_yaml): else: metrics_fn = tc.compute_scores - with open(config['output_file'],'w') as output_file: + with open(config['output_file'],'a') as output_file: for classifier_name, runs in config['run'].items(): for run, settings in runs.items(): scores = run_one(classifier_name, bands, settings, @@ -77,15 +82,16 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, if classifier.wants_arrays: errors = 
settings.get('errors') colors = settings.get('colors') - train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors) - valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors) + sizes = settings.get('sizes') + train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors, sizes=sizes) + valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors, sizes=sizes) print ("Executing: ", classifier_name, bands, settings) ## first check if options are valid print (settings, classifier.valid_options) for key in settings.keys(): - if key not in classifier.valid_options and key not in ['errors', 'colors']: + if key not in classifier.valid_options and key not in ['errors', 'colors', 'sizes']: raise ValueError(f"Key {key} is not recognized by classifier {classifier_name}") print ("Initializing classifier...") @@ -102,7 +108,7 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, print ("Making some pretty plots...") name = str(classifier.__name__) - tc.metrics.plot_distributions(valid_z, results, f"plots/{name}_{settings}_{bands}.png") + tc.metrics.plot_distributions(valid_z, results, f"plots/{name}_{bands}.png") return scores diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 5670a39c..0f19205f 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -9,15 +9,16 @@ metrics_impl: firecrown run: # This is a class name which will be looked up ComplexSOM: - run_3: + run_5: # These settings are sent to the classifier + testing: True bins: 5 som_dim: [51,51] - num_groups: 100 + num_groups: 200 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.5 + sparse_frac: 1.0 plots: True plot_dir: 'plots' metric: "SNR_3x2" @@ -28,4 +29,26 @@ run: # as well as the magnitudes colors: False errors: False + sizes: False + run_10: + # These settings are sent to the classifier + testing: False + bins: 10 + som_dim: [51,51] + num_groups: 200 + num_threads: 32 + group_type: 'redshift' + data_threshold: [0,30] + sparse_frac: 1.0 + plots: True + plot_dir: 'plots' + metric: "SNR_3x2" + use_inbin: False + use_outbin: False + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: False + errors: False + sizes: False diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index d82fb034..915ba17e 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -52,7 +52,7 @@ class ComplexSOM(Tomographer): # valid parameter -- see below valid_options = ['bins','som_dim','num_groups','num_threads', 'group_type','data_threshold','sparse_frac','plots', - 'plot_dir','metric','use_inbin','use_outbin'] + 'plot_dir','metric','use_inbin','use_outbin','testing'] # this settings means arrays will be sent to train and apply instead # of dictionaries wants_arrays = False @@ -138,85 +138,117 @@ def train (self, training_data, training_z): #Define the SOM variables if self.bands == 'riz': #riz bands - #expressions = ("r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", - # "z_mag","r_mag-i_mag-(i_mag-z_mag)") - expressions = ("r-i","r-z","i-z", - "z","r-i-(i-z)") + if self.opt['sizes'] == True: + expressions = ("r-i","r-z","i-z", + "z","r-i-(i-z)","mcal_T") + else: + expressions = ("r-i","r-z","i-z", + "z","r-i-(i-z)") elif self.bands == 'griz': #griz bands - #expressions = ("g_mag-r_mag","g_mag-i_mag", - # 
"g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", - # "z_mag","g_mag-r_mag-(r_mag-i_mag)", - # "r_mag-i_mag-(i_mag-z_mag)") - expressions = ("g-r","g-i", - "g-z","r-i","r-z","i-z", - "z","g-r-(r-i)", - "r-i-(i-z)") + if self.opt['sizes'] == True: + expressions = ("g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","g-r-(r-i)", + "r-i-(i-z)","mcal_T") + else: + expressions = ("g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","g-r-(r-i)", + "r-i-(i-z)") elif self.bands == 'grizy': #grizy bands - #expressions = ("g_mag-r_mag","g_mag-i_mag", - # "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", - # "z_mag-y_mag","z_mag","g_mag-r_mag-(r_mag-i_mag)", - # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") - expressions = ("g-r","g-i", - "g-z","g-y","r-i","r-z","r-y","i-z","i-y", - "z-y","z","g-r-(r-i)", - "r-i-(i-z)","i-z-(z-y)") + if self.opt['sizes'] == True: + expressions = ("g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)","mcal_T") + else: + expressions = ("g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") elif self.bands == 'ugriz': #ugrizy bands - #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","g_mag-r_mag","g_mag-i_mag", - # "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", - # "z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", - # "r_mag-i_mag-(i_mag-z_mag)") - expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", - "g-z","r-i","r-z","i-z", - "z","u-g-(g-r)","g-r-(r-i)", - "r-i-(i-z)") + if self.opt['sizes'] == True: + expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","mcal_T") + else: + expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)") elif self.bands == 'ugrizy': #ugrizy bands - #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","u_mag-y_mag","g_mag-r_mag","g_mag-i_mag", - # "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", - # "z_mag-y_mag","z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", - # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") - expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", - "g-z","g-y","r-i","r-z","r-y","i-z","i-y", - "z-y","z","u-g-(g-r)","g-r-(r-i)", - "r-i-(i-z)","i-z-(z-y)") - - print("Preparing the data") - training_data = pd.DataFrame.from_dict(training_data) - #Add the redshift variable to the train data - print("Adding redshift info to training data") - training_data['redshift_true'] = training_z - - if sparse_frac < 1: - print("Sparse Sampling the training data") - cut = np.random.uniform(0, 1, training_z.size) < sparse_frac - training_data = training_data[cut] - training_z = training_z[cut] - - #Construct the training data frame (just a python-to-R data conversion) - print("Converting the data to R format") - with localconverter(ro.default_converter + pandas2ri.converter): - #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) - train_df = ro.conversion.py2rpy(training_data) - - #Construct or Load the SOM - som_outname = f"SOM_{som_dim}_{self.bands}.pkl" - if not os.path.exists(som_outname): - print("Training the SOM using R kohtrain") - #Train the SOM using R kohtrain - 
som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), - n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) - #Output the SOM - #base.save(som,file=som_outname) - with open(som_outname, 'wb') as f: - pickle.dump(som, f) + if self.opt['sizes'] == True: + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)","mcal_T") + else: + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + + + #Define the output SOM name + if self.opt['sizes'] == True: + som_outname = f"SOM_{som_dim}_{self.bands}_withsize.pkl" else: - print("Loading the pretrained SOM") - with open(som_outname, 'rb') as f: + som_outname = f"SOM_{som_dim}_{self.bands}.pkl" + + #If Testing, load the pretrained SOM and matching training catalogue + if self.opt['testing'] == True: + #Use the default training set + with open(f"test_{som_outname}", 'rb') as f: som = pickle.load(f) - som.rx2['unit.classif']=FloatVector([]) + print(base.length(som.rx2['unit.classif'])) + with open("training_testcat.pkl", 'rb') as f: + train_df = pickle.load(f) + print(utils.str(train_df)) + else: + #Use the training set provided + print("Preparing the data") + training_data = pd.DataFrame.from_dict(training_data) + #Add the redshift variable to the train data + print("Adding redshift info to training data") + training_data['redshift_true'] = training_z + + if sparse_frac < 1: + print("Sparse Sampling the training data") + cut = np.random.uniform(0, 1, training_z.size) < sparse_frac + training_data = training_data[cut] + training_z = training_z[cut] + + #Construct the training data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + train_df = ro.conversion.py2rpy(training_data) + + #with open("training_testcat.pkl", 'wb') as f: + # pickle.dump(train_df, f) + + #Construct or Load the SOM + if not os.path.exists(som_outname): + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + print(base.length(som.rx2['unit.classif'])) + #Output the SOM + #base.save(som,file=som_outname) + #with open(f"test_{som_outname}", 'wb') as f: + # pickle.dump(som, f) + with open(som_outname, 'wb') as f: + pickle.dump(som, f) + else: + print("Loading the pretrained SOM") + with open(som_outname, 'rb') as f: + som = pickle.load(f) + som.rx2['unit.classif']=FloatVector([]) #If grouping by redshift, construct the cell redshift statistics if group_type == 'redshift' or plots == True: diff --git a/tomo_challenge/data.py b/tomo_challenge/data.py index 03eadc5c..5b8a7af4 100644 --- a/tomo_challenge/data.py +++ b/tomo_challenge/data.py @@ -124,6 +124,12 @@ def add_colors(data, bands, errors=False): if errors: data[f'{b}{c}_err'] = np.sqrt(data[f'{b}_err']**2 + data[f'{c}_err']**2) +def add_size(data, filename): + + with h5py.File(filename, 'r') as f: + # load all bands + data['mcal_T'] = f[f'mcal_T'][:] + def dict_to_array(data, bands, errors=False, colors=False): nobj = data[bands[0]].size @@ -165,12 +171,15 @@ 
def colors_for_bands(bands): -def load_data(filename, bands, colors=False, errors=False, array=False): +def load_data(filename, bands, colors=False, errors=False, size=False, array=False): data = load_mags(filename, bands, errors=errors) if colors: add_colors(data, bands, errors=errors) + if size: + add_size(data, filename) + if array: data = dict_to_array(data, bands, errors=errors, colors=colors) From 51f357a750b3fa81ac81dbade8d287f5346162de Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 8 Sep 2020 15:07:54 +0200 Subject: [PATCH 27/35] lowered number of groups to 100 --- example/ComplexSOM.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 0f19205f..e6aea060 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -14,7 +14,7 @@ run: testing: True bins: 5 som_dim: [51,51] - num_groups: 200 + num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] From 49aa17d4a01104fb55434b7839afcbe4aa7e7397 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Thu, 10 Sep 2020 12:19:13 +0200 Subject: [PATCH 28/35] fixed bug in group_type --- tomo_challenge/classifiers/ComplexSOM.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 915ba17e..5f2c8365 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -265,12 +265,16 @@ def train (self, training_data, training_z): props = props.rx2("data.white") props.rx[base.which(base.is_na(props))] = -1 print(base.summary(props)) + + if group_type=='redshift': + print("Groups will be constructed by redshift property") hclust=stats.hclust(stats.dist(props)) cell_group=stats.cutree(hclust,k=num_groups) - #Assign the cell groups to the SOM structure som.rx2['hclust']=hclust som.rx2['cell_clust']=cell_group + else: + print("Groups will be constructed by redshift property") #Construct the Nz properties per SOM group print("Constructing group-based redshift properties") From 5ca0871841ddae08e02b8f5a166b3e7b0c64369b Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Thu, 10 Sep 2020 12:20:39 +0200 Subject: [PATCH 29/35] fixed print typo --- tomo_challenge/classifiers/ComplexSOM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 5f2c8365..90d463b1 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -274,7 +274,7 @@ def train (self, training_data, training_z): som.rx2['hclust']=hclust som.rx2['cell_clust']=cell_group else: - print("Groups will be constructed by redshift property") + print("Groups will be constructed by colour property") #Construct the Nz properties per SOM group print("Constructing group-based redshift properties") From 860f54e847a6c2e32deddfa62b704a13cc32dfa6 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Fri, 11 Sep 2020 16:50:59 +0200 Subject: [PATCH 30/35] added redshift_propset and whiten options --- tomo_challenge/classifiers/ComplexSOM.py | 49 +++++++++++++++++------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 90d463b1..2ee47bc6 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -52,7 +52,8 @@ class ComplexSOM(Tomographer): # valid parameter -- see below 
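    # For orientation, a hedged NumPy sketch of the per-group summary
    # statistics that the new `redshift_propset` option selects below
    # (`z` stands for the true redshifts of one SOM group; the helper
    # name is illustrative, and note that R's mad() applies a 1.4826
    # scaling which this sketch omits):
    #   def group_props(z, propset=2):
    #       med = np.median(z)
    #       if propset == 2:
    #           return np.mean(z), med, np.median(np.abs(z - med))
    #       # propset 3: quantile summary of the group N(z)
    #       return np.quantile(z, [0.05, 0.16, 0.50, 0.84, 0.95])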
valid_options = ['bins','som_dim','num_groups','num_threads', 'group_type','data_threshold','sparse_frac','plots', - 'plot_dir','metric','use_inbin','use_outbin','testing'] + 'plot_dir','metric','use_inbin','use_outbin','testing', + 'whiten','redshift_propset'] # this settings means arrays will be sent to train and apply instead # of dictionaries wants_arrays = False @@ -117,6 +118,10 @@ def train (self, training_data, training_z): use_outbin = self.opt['use_outbin'] #Plots plots = self.opt['plots'] + #redshift properties to use in grouping + redshift_propset = self.opt['redshift_propset'] + #Whiten the redshift data when grouping? + whiten = self.opt['whiten'] #Plot Output Directory plot_dir = self.opt['plot_dir'] #Group Type @@ -131,10 +136,24 @@ def train (self, training_data, training_z): 'use_p_outbin': use_outbin} #Define the redshift summary statistics (used for making groups in the 'redshift' case - property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true") - property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", - "mad(data$redshift_true)", - "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))") + if redshift_propset==1: + property_labels = StrVector(("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true")) + property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", + "mad(data$redshift_true)", + "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))")) + elif redshift_propset==2: + property_labels = StrVector(("mean_z_true","med_z_true","mad_z_true")) + property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)", + "mad(data$redshift_true)")) + elif redshift_propset==3: + property_labels = StrVector(("quan_05","quan_16","quan_50","quan_84","quan_95")) + property_expressions = ("quantile(data$redshift_true,probs=c(5,16,50,84,95)/100)") + else: + property_labels = StrVector(("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true")) + property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", + "mad(data$redshift_true)", + "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))")) + #Define the SOM variables if self.bands == 'riz': #riz bands @@ -193,7 +212,6 @@ def train (self, training_data, training_z): "z-y","z","u-g-(g-r)","g-r-(r-i)", "r-i-(i-z)","i-z-(z-y)") - #Define the output SOM name if self.opt['sizes'] == True: som_outname = f"SOM_{som_dim}_{self.bands}_withsize.pkl" @@ -255,15 +273,20 @@ def train (self, training_data, training_z): print("Constructing cell-based redshift properties") #Construct the Nz properties per SOM cell cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, - expression=StrVector(property_expressions),expr_label=StrVector(property_labels),returnMatrix=True) + expression=property_expressions,expr_label=property_labels,returnMatrix=True) som=cell_prop.rx2('som') som.rx2['clust.classif']=FloatVector([]) print("Constructing redshift-based hierarchical cluster tree") #Cluster the SOM cells into num_groups groups - props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']), - data_missing='NA',data_threshold=FloatVector([0,12])) - props = props.rx2("data.white") - props.rx[base.which(base.is_na(props))] = -1 + if whiten==True: + props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']), + 
data_missing='NA',data_threshold=FloatVector([0,12])) + props = props.rx2("data.white") + props.rx[base.which(base.is_na(props))] = -10 + else: + props = cell_prop.rx2['property'] + props.rx[base.which(base.is_na(props))] = -1 + print(base.summary(props)) if group_type=='redshift': @@ -279,13 +302,13 @@ def train (self, training_data, training_z): #Construct the Nz properties per SOM group print("Constructing group-based redshift properties") group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, - expression=StrVector(property_expressions),expr_label=StrVector(property_labels), + expression=property_expressions,expr_label=property_labels, n_cluster_bins=num_groups,returnMatrix=True) #extract the training som (just for convenience) train_som = group_prop.rx2('som') - if plots == True: + if plots == True and redshift_propset==1: #Make the diagnostic plots props = group_prop.rx2('property') cprops = cell_prop.rx2('property') From 21510aa32640d4e3a87db413622f0c73f35c9ef3 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Fri, 11 Sep 2020 17:45:49 +0100 Subject: [PATCH 31/35] update --- the_atlantics/optimizer.py | 92 +++++++++----- the_atlantics/snc.py | 148 +++++++++++++++++++---- tomo_challenge/classifiers/ComplexSOM.py | 50 +++++--- tomo_challenge/snc.py | 148 +++++++++++++++++++---- 4 files changed, 338 insertions(+), 100 deletions(-) diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py index d64bedc4..f46dd508 100644 --- a/the_atlantics/optimizer.py +++ b/the_atlantics/optimizer.py @@ -4,7 +4,6 @@ from scipy.interpolate import interp1d from scipy.optimize import minimize, brentq from scipy.integrate import simps -import matplotlib.pyplot as plt parser = ArgumentParser() @@ -12,6 +11,8 @@ help="Path to input file containing the N(z) of all initial groups") parser.add_argument("--prefix-out", default='out', type=str, help="Prefix for all output files") +parser.add_argument("--suffix-out", default='', type=str, + help="Suffix for all output files") parser.add_argument("--n-bins", default=4, type=int, help="Number of bins (excluding a possible trash bin") parser.add_argument("--prob-in-bin", default=-1., type=float, @@ -20,6 +21,16 @@ parser.add_argument("--prob-out-bin", default=-1., type=float, help="Maximum fraction of the N(z) that should be inside other bins." 
"If <= 0, this will not be taken into account in the optimization.") +parser.add_argument('--in-bin-free', default=False, action='store_true', + help='Free up in-bin probability (default: False)') +parser.add_argument('--use-clustering', default=False, action='store_true', + help='Use clustering (default: False)') +parser.add_argument('--use-3x2', default=False, action='store_true', + help='Use 3x2 (default: False)') +parser.add_argument('--random-start', default=False, action='store_true', + help='Use random starting points (default: False)') +parser.add_argument('--random-seed', default=1234, type=int, + help='Random seed') o = parser.parse_args() nzs = np.load(o.nz_file) @@ -38,20 +49,41 @@ nzs = nzs * ng_tot / ng_tot_small # Initialize calculator -sc = SnCalc(zz, nzs, fsky=fsky) -sc.get_cl_matrix(fname_save=o.prefix_out + 'cl_wl.npz') +sc = SnCalc(zz, nzs, fsky=fsky, use_clustering=o.use_clustering, use_3x2=o.use_3x2) +if o.use_3x2: + suffix = 'cl_3x2.npz' +else: + if o.use_clustering: + suffix = 'cl_gc.npz' + else: + suffix = 'cl_wl.npz' +sc.get_cl_matrix(fname_save=o.prefix_out + suffix) # Initial edge guess (defined as having equal number of galaxies) -nz_tot = np.sum(nzs, axis=0) -cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) -cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, - fill_value=(0, 1)) -edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) - for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) - +if o.random_start: + np.random.seed(o.random_seed) + z_end = 1.5 + delta_z = z_end / (o.n_bins-1) + edges_0 = delta_z * (np.arange(o.n_bins-1) + np.random.rand(o.n_bins-1)) +else: + nz_tot = np.sum(nzs, axis=0) + cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) + cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + fill_value=(0, 1)) + edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) + for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) + +if o.in_bin_free: + if o.random_start: + pin = np.random.rand() + else: + pin = 0.5 + edges_0 = np.concatenate((np.array([pin]), edges_0)) +print(edges_0) # Minimize # Binning parameters + params = assign_params_default.copy() if o.prob_in_bin > 0: params['p_inbin_thr'] = o.prob_in_bin @@ -59,15 +91,29 @@ if o.prob_out_bin > 0: params['p_outbin_thr'] = o.prob_out_bin params['use_p_outbin'] = True +if o.in_bin_free: + params['use_p_inbin'] = True + params['p_inbin_thr'] = edges_0[0] def minus_sn(edges, calc): - return -calc.get_sn_from_edges(edges, assign_params=params) + if o.in_bin_free: + if (edges[0] < 0) or (edges[0] > 1): + return 0. 
+ params['p_inbin_thr'] = edges[0] + edges_use = edges[1:] + else: + edges_use = edges + return -calc.get_sn_from_edges(edges_use, assign_params=params) # Actual optimization res = minimize(minus_sn, edges_0, method='Powell', args=(sc,)) -edges_1 = res.x +if o.in_bin_free: + edges_1 = res.x[1:] + params['p_inbin_thr'] = res.x[0] +else: + edges_1 = res.x # Post-process: # S/N @@ -92,26 +138,16 @@ def get_bin_name(i): d_out = {'sn': sn, 'edges': edges_1} -plt.figure() +if o.in_bin_free: + d_out['p_in'] = params['p_inbin_thr'] + print("P_in: ", params['p_inbin_thr']) for (i, a), n in zip(assign, nz_best): name = get_bin_name(i) d_out[name+'_groups'] = a d_out[name+'_nz'] = n - print("Bin %d, groups: " % i, a) n_here = simps(n, x=zz) f = n_here / n_tot frac = "%.1lf" % (100 * f) - if i == -1: - bin_name = 'Trash bin' - else: - bin_name = f'Bin {i}' - plt.plot(zz, n/n_here, - label=bin_name + f' ({frac} %)') -plt.legend() -plt.xlabel(r'$z$', fontsize=15) -plt.ylabel(r'$p(z)$', fontsize=15) -plt.savefig(o.prefix_out + '_nz_summary.png', - bbox_inches='tight') - -np.savez(o.prefix_out + '_bin_info.npz', **d_out) -plt.show() + print("Bin %d, groups: " % i, a, frac) + +np.savez(o.prefix_out + o.suffix_out + '_bin_info.npz', **d_out) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index db83e1c8..24fed447 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -13,8 +13,9 @@ class SnCalc(object): edges_large = 3. - def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.26, use_clustering=False, integrator='spline'): + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, n_ell=100, + s_gamma=0.26, use_clustering=False, use_3x2=False, + integrator='qag_quad'):#spline'): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are @@ -32,6 +33,7 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, `use_clustering=False`). use_clustering (bool): if `True`, SNR will be computed for clustering instead of lensing. + use_3x2 (bool): if `True`, SNR will be computed for 3x2pt. integrator (string): CCL integration method. Either 'qag_quad' or 'spline'. """ @@ -39,9 +41,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.s_gamma = s_gamma self.fsky = fsky self.lmax = lmax - self.d_ell = d_ell - self.larr = np.arange(2, lmax, d_ell)+d_ell/2 - self.n_ell = len(self.larr) + self.n_ell = n_ell + ell_edges = np.logspace(2, np.log10(lmax), n_ell+1) + self.larr = 0.5*(ell_edges[1:]+ell_edges[:-1]) + self.d_ell = (ell_edges[1:]-ell_edges[:-1]) self.n_samples = len(nz_list) self.z_arr = z_arr self.nz_list = nz_list @@ -53,9 +56,14 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering + self.use_3x2 = use_3x2 + if use_3x2: + self.n_tracers = 2*self.n_samples + else: + self.n_tracers = self.n_samples def _bz_model(self, cosmo, z): - return 0.95/ccl.growth_factor(cosmo, 1./(1+z)) + return 1./ccl.growth_factor(cosmo, 1./(1+z)) def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. 
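# For reference, a standalone sketch of the bandpower construction set up
# in the __init__ hunk above (assuming the defaults lmax=2000, n_ell=100):
#   import numpy as np
#   ell_edges = np.logspace(2, np.log10(2000), 101)  # 100 log-spaced bands from ell=100
#   ells = 0.5 * (ell_edges[1:] + ell_edges[:-1])    # band centres
#   d_ell = np.diff(ell_edges)                       # band widths
#   n_modes = 0.4 * (ells + 0.5) * d_ell             # fsky*(l+1/2)*dl, as used in get_sn_wn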
@@ -79,20 +87,27 @@ def get_cl_matrix(self, fname_save=None, recompute=False): sigma8=0.81) # Tracers - if self.use_clustering: - trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bz = self._bz_model(cosmo, self.z_arr) + if self.use_3x2: + trs_gc = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + trs_wl = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + trs = trs_gc + trs_wl else: - trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] # Cls - self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) - for i in range(self.n_samples): - for j in range(i, self.n_samples): + self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) + for i in range(self.n_tracers): + for j in range(i, self.n_tracers): cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, limber_integration_method=self.integrator) self.cls[:, i, j] = cl @@ -126,22 +141,99 @@ def _get_cl_resamples(self, weights): """ if self.cls is None: self.get_cl_matrix() - return np.einsum('jl,km,ilm', weights, weights, self.cls) + if self.use_3x2: + w_use = np.tile(weights, (2, 2)) + else: + w_use = weights + return np.sum(np.sum(w_use[None, None, :, :] * + self.cls[:, :, None, :], + axis=-1)[:, None, :, :] * + w_use[None, :, :, None], + axis=-2) def _get_nl_resamples(self, n_dens): """ Gets noise contribution to C_ell matrix for resampled groups from list of number densities. """ - if self.use_clustering: - return np.diag(1./n_dens)[None, :, :] + if self.use_3x2: + n_diag = np.array(list(1./n_dens) + + list(self.s_gamma**2/n_dens)) + return np.diag(n_diag)[None, :, :] else: - return np.diag(self.s_gamma**2/n_dens)[None, :, :] + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_max(self, full_output=False): + """ Compute maximum signal-to-noise ratio given the input + groups + Args: + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + weights = np.eye(self.n_samples) + n_dens = self.n_dens + return self.get_sn_wn(weights, n_dens, full_output=full_output) + + def get_kltrans(self): + """ Compute KL decomposition. + + Returns: + Fisher matrix element for each of the KL modes. 
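+            The modes are obtained by whitening the total covariance with
+            the noise covariance, diagonalizing the whitened matrix at
+            each ell, and sorting the eigenmodes by their Fisher
+            contribution summed over ell (largest first).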
+ """ + nl = self._get_nl_resamples(self.n_dens) + nij = nl*np.ones(self.n_ell)[:, None, None] + cij = self.cls + nij + sij = cij - nij + inv_nij = np.linalg.inv(nij) + metric = inv_nij + + def change_basis(c, m, ev): + return np.array([np.diag(np.dot(ev[i].T, + np.dot(m[i], + np.dot(c[i], + np.dot(m[i], + ev[i]))))) + for i in range(self.n_ell)]) + + def diagonalize(c, m): + im = np.linalg.inv(m) + ll = np.linalg.cholesky(m) + ill = np.linalg.cholesky(im) + cl = np.array([np.dot(np.transpose(ll[i]), + np.dot(c[i], ll[i])) + for i in range(self.n_ell)]) + c_p, v = np.linalg.eigh(cl) + ev = np.array([np.dot(np.transpose(ill[i]), v[i]) + for i in range(self.n_ell)]) + # iden = change_basis(im, m, ev) + return ev, c_p + + e_v, c_p = diagonalize(cij, metric) + s_p = change_basis(sij, metric, e_v) + nmodes_l = (self.fsky * self.d_ell * (self.larr+0.5)) + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + isort = np.argsort(-np.sum(fish_kl, axis=0)) + e_o = e_v[:, :, isort] + # f_o = np.array([np.dot(inv_nij[l], e_o[l, :, :]) + # for l in range(self.n_ell)]) + c_p = change_basis(cij, metric, e_o) + s_p = change_basis(sij, metric, e_o) + + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + fish_permode = np.sum(fish_kl, axis=0) + fish_cumul = np.cumsum(fish_permode) + return fish_kl, fish_permode, fish_cumul def get_sn_wn(self, weights, n_dens, full_output=False): """ Compute signal-to-noise ratio from weights matrix and number densities of the resampled groups. Args: - weights (array_like): weights matrix of shape + weights (array_like): weights matrix of shape `(N_resample, N_initial)`, where `N_resample` is the number of new groups, and `N_initial` is the original number of groups. Each entry corresponds to the weight @@ -152,7 +244,7 @@ def get_sn_wn(self, weights, n_dens, full_output=False): full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ sl = self._get_cl_resamples(weights) @@ -163,8 +255,8 @@ def get_sn_wn(self, weights, n_dens, full_output=False): sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) - snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + snr = np.sqrt(np.sum(trsn2_ell * (self.larr + 0.5) * + self.d_ell * self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -192,9 +284,12 @@ def get_sn(self, assign, full_output=False): def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. """ + swapped = False + if np.ndim(edges) > 0: + swapped = np.any(np.diff(edges) < 0) return np.any(edges < 0) or \ np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + swapped def get_nzs_from_edges(self, edges, assign_params=assign_params_default): assign = self.assign_from_edges(edges, assign_params=assign_params) @@ -215,6 +310,7 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, Returns: List of assignment arrays ready to be used in e.g. `get_sn`. 
""" + edges = np.atleast_1d(edges) nbins = len(edges) + 1 # Bin IDs based on mean z ids = np.digitize(self.z_means, bins=edges) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index d83d3d7f..986c3f8e 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -23,8 +23,9 @@ from rpy2.robjects import pandas2ri from rpy2.robjects.conversion import localconverter from ..snc import SnCalc -from scipy.optimize import minimize -from scipy.integrate import simps +from scipy.interpolate import interp1d +from scipy.optimize import minimize, brentq +from scipy.integrate import simps import pickle #Check that all the needed packages are installed @@ -126,9 +127,9 @@ def train (self, training_data, training_z): group_type = 'redshift' #Define the assign_params variable, used in bin optimisation assign_params = {'p_inbin_thr': 0.5, - 'p_outbin_thr': 0.2, - 'use_p_inbin': use_inbin, - 'use_p_outbin': use_outbin} + 'p_outbin_thr': 0.2, + 'use_p_inbin': use_inbin, + 'use_p_outbin': use_outbin} #Define the redshift summary statistics (used for making groups in the 'redshift' case property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true") @@ -303,8 +304,8 @@ def train (self, training_data, training_z): z_arr = np.linspace(0, 2, 124) z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2. #Construct the per-group Nz - nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] - + nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) + for group in np.arange(num_groups)[np.array(tab)!=0]] #np.save('plots/nzs.npy', nzs) #Update the fsky @@ -315,35 +316,42 @@ def train (self, training_data, training_z): area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky ng_tot_goal = ndens_arcmin * area_arcmin ng_tot_curr = np.sum(np.array(tab)) - nzs = [nz * ng_tot_goal / ng_tot_curr for nz in nzs] - + nzs = np.array([nz * ng_tot_goal / ng_tot_curr for nz in nzs]) #np.save('plots/nzs_norm.npy', nzs) - - #print(nzs) # Now initialize a S/N calculator for these initial groups. os.system('rm wl_nb%d.npz' % n_bin) if metric == 'SNR_ww': - c_wl = SnCalc(z_cen, nzs, use_clustering=False,fsky=fsky) + c_wl = SnCalc(z_cen, nzs, use_clustering=False, fsky=fsky) else: c_wl = SnCalc(z_cen, nzs, use_clustering=True,fsky=fsky) - print("Initializing WL") + print("Initializing Cls") c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin) + + # Initial edge guess + nz_tot = np.sum(nzs, axis=0) + cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) + cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + fill_value=(0, 1)) + edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) + for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) + # Finally, let's write the function to minimize and optimize for a 4-bin case. 
def minus_sn(edges, calc): return -calc.get_sn_from_edges(edges,assign_params=assign_params) print("Optimizing WL") - edges_0 = np.linspace(0, 2, n_bin-1) res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) - print("WL final edges: ", res.x) - print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x,assign_params=assign_params)*np.sqrt(0.25/fsky)) + print("Final edges: ", res.x) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x, + assign_params=assign_params)) print(" ") - print(res) #Construct the per-group Nz print("Outputting trained SOM and redshift ordering of SOM groups") - groups_in_tomo = c_wl.assign_from_edges(res.x, get_ids=True) + groups_in_tomo = c_wl.assign_from_edges(res.x, + assign_params=assign_params, + get_ids=True) group_bins = np.zeros(num_groups) for bin_no, groups in groups_in_tomo: print(bin_no, groups) @@ -392,8 +400,10 @@ def apply (self, data): #Assign the sources, by group, to tomographic bins print("Output source tomographic bin assignments") - valid_bin = base.unsplit(IntVector(self.group_bins),FactorVector(valid_som.rx2['clust.classif'], - levels=base.seq(num_groups)),drop=False) + valid_bin = base.unsplit(IntVector(self.group_bins), + FactorVector(valid_som.rx2['clust.classif'], + levels=base.seq(num_groups)), + drop=False) valid_bin = np.array(valid_bin) return valid_bin diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index db83e1c8..24fed447 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -13,8 +13,9 @@ class SnCalc(object): edges_large = 3. - def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.26, use_clustering=False, integrator='spline'): + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, n_ell=100, + s_gamma=0.26, use_clustering=False, use_3x2=False, + integrator='qag_quad'):#spline'): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are @@ -32,6 +33,7 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, `use_clustering=False`). use_clustering (bool): if `True`, SNR will be computed for clustering instead of lensing. + use_3x2 (bool): if `True`, SNR will be computed for 3x2pt. integrator (string): CCL integration method. Either 'qag_quad' or 'spline'. """ @@ -39,9 +41,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.s_gamma = s_gamma self.fsky = fsky self.lmax = lmax - self.d_ell = d_ell - self.larr = np.arange(2, lmax, d_ell)+d_ell/2 - self.n_ell = len(self.larr) + self.n_ell = n_ell + ell_edges = np.logspace(2, np.log10(lmax), n_ell+1) + self.larr = 0.5*(ell_edges[1:]+ell_edges[:-1]) + self.d_ell = (ell_edges[1:]-ell_edges[:-1]) self.n_samples = len(nz_list) self.z_arr = z_arr self.nz_list = nz_list @@ -53,9 +56,14 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering + self.use_3x2 = use_3x2 + if use_3x2: + self.n_tracers = 2*self.n_samples + else: + self.n_tracers = self.n_samples def _bz_model(self, cosmo, z): - return 0.95/ccl.growth_factor(cosmo, 1./(1+z)) + return 1./ccl.growth_factor(cosmo, 1./(1+z)) def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. 
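# The bias model above, b(z) = 1/D(z), describes a sample whose clustering
# amplitude b(z)*D(z) is constant with redshift. A quick check, assuming
# illustrative cosmological parameters:
#   import pyccl as ccl
#   cosmo = ccl.Cosmology(Omega_c=0.25, Omega_b=0.05, h=0.67,
#                         n_s=0.96, sigma8=0.81)
#   z = 0.5
#   print(1. / ccl.growth_factor(cosmo, 1. / (1 + z)))  # bias at z=0.5, ~1.3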
@@ -79,20 +87,27 @@ def get_cl_matrix(self, fname_save=None, recompute=False): sigma8=0.81) # Tracers - if self.use_clustering: - trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bz = self._bz_model(cosmo, self.z_arr) + if self.use_3x2: + trs_gc = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + trs_wl = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + trs = trs_gc + trs_wl else: - trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] # Cls - self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) - for i in range(self.n_samples): - for j in range(i, self.n_samples): + self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) + for i in range(self.n_tracers): + for j in range(i, self.n_tracers): cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, limber_integration_method=self.integrator) self.cls[:, i, j] = cl @@ -126,22 +141,99 @@ def _get_cl_resamples(self, weights): """ if self.cls is None: self.get_cl_matrix() - return np.einsum('jl,km,ilm', weights, weights, self.cls) + if self.use_3x2: + w_use = np.tile(weights, (2, 2)) + else: + w_use = weights + return np.sum(np.sum(w_use[None, None, :, :] * + self.cls[:, :, None, :], + axis=-1)[:, None, :, :] * + w_use[None, :, :, None], + axis=-2) def _get_nl_resamples(self, n_dens): """ Gets noise contribution to C_ell matrix for resampled groups from list of number densities. """ - if self.use_clustering: - return np.diag(1./n_dens)[None, :, :] + if self.use_3x2: + n_diag = np.array(list(1./n_dens) + + list(self.s_gamma**2/n_dens)) + return np.diag(n_diag)[None, :, :] else: - return np.diag(self.s_gamma**2/n_dens)[None, :, :] + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_max(self, full_output=False): + """ Compute maximum signal-to-noise ratio given the input + groups + Args: + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + weights = np.eye(self.n_samples) + n_dens = self.n_dens + return self.get_sn_wn(weights, n_dens, full_output=full_output) + + def get_kltrans(self): + """ Compute KL decomposition. + + Returns: + Fisher matrix element for each of the KL modes. 
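+            The modes are obtained by whitening the total covariance with
+            the noise covariance, diagonalizing the whitened matrix at
+            each ell, and sorting the eigenmodes by their Fisher
+            contribution summed over ell (largest first).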
+ """ + nl = self._get_nl_resamples(self.n_dens) + nij = nl*np.ones(self.n_ell)[:, None, None] + cij = self.cls + nij + sij = cij - nij + inv_nij = np.linalg.inv(nij) + metric = inv_nij + + def change_basis(c, m, ev): + return np.array([np.diag(np.dot(ev[i].T, + np.dot(m[i], + np.dot(c[i], + np.dot(m[i], + ev[i]))))) + for i in range(self.n_ell)]) + + def diagonalize(c, m): + im = np.linalg.inv(m) + ll = np.linalg.cholesky(m) + ill = np.linalg.cholesky(im) + cl = np.array([np.dot(np.transpose(ll[i]), + np.dot(c[i], ll[i])) + for i in range(self.n_ell)]) + c_p, v = np.linalg.eigh(cl) + ev = np.array([np.dot(np.transpose(ill[i]), v[i]) + for i in range(self.n_ell)]) + # iden = change_basis(im, m, ev) + return ev, c_p + + e_v, c_p = diagonalize(cij, metric) + s_p = change_basis(sij, metric, e_v) + nmodes_l = (self.fsky * self.d_ell * (self.larr+0.5)) + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + isort = np.argsort(-np.sum(fish_kl, axis=0)) + e_o = e_v[:, :, isort] + # f_o = np.array([np.dot(inv_nij[l], e_o[l, :, :]) + # for l in range(self.n_ell)]) + c_p = change_basis(cij, metric, e_o) + s_p = change_basis(sij, metric, e_o) + + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + fish_permode = np.sum(fish_kl, axis=0) + fish_cumul = np.cumsum(fish_permode) + return fish_kl, fish_permode, fish_cumul def get_sn_wn(self, weights, n_dens, full_output=False): """ Compute signal-to-noise ratio from weights matrix and number densities of the resampled groups. Args: - weights (array_like): weights matrix of shape + weights (array_like): weights matrix of shape `(N_resample, N_initial)`, where `N_resample` is the number of new groups, and `N_initial` is the original number of groups. Each entry corresponds to the weight @@ -152,7 +244,7 @@ def get_sn_wn(self, weights, n_dens, full_output=False): full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ sl = self._get_cl_resamples(weights) @@ -163,8 +255,8 @@ def get_sn_wn(self, weights, n_dens, full_output=False): sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) - snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + snr = np.sqrt(np.sum(trsn2_ell * (self.larr + 0.5) * + self.d_ell * self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -192,9 +284,12 @@ def get_sn(self, assign, full_output=False): def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. """ + swapped = False + if np.ndim(edges) > 0: + swapped = np.any(np.diff(edges) < 0) return np.any(edges < 0) or \ np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + swapped def get_nzs_from_edges(self, edges, assign_params=assign_params_default): assign = self.assign_from_edges(edges, assign_params=assign_params) @@ -215,6 +310,7 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, Returns: List of assignment arrays ready to be used in e.g. `get_sn`. 
""" + edges = np.atleast_1d(edges) nbins = len(edges) + 1 # Bin IDs based on mean z ids = np.digitize(self.z_means, bins=edges) From 27ed0ddb46d1d73dded2eef4dd81036df5736065 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Fri, 11 Sep 2020 17:48:34 +0100 Subject: [PATCH 32/35] try --- the_atlantics/snc.py | 7 +++++-- tomo_challenge/snc.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 24fed447..4c1d537e 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -108,8 +108,11 @@ def get_cl_matrix(self, fname_save=None, recompute=False): self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) for i in range(self.n_tracers): for j in range(i, self.n_tracers): - cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, - limber_integration_method=self.integrator) + try: + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) + except: + cl = np.zeros(len(self.larr)) self.cls[:, i, j] = cl if j != i: self.cls[:, j, i] = cl diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 24fed447..4c1d537e 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -108,8 +108,11 @@ def get_cl_matrix(self, fname_save=None, recompute=False): self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) for i in range(self.n_tracers): for j in range(i, self.n_tracers): - cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, - limber_integration_method=self.integrator) + try: + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) + except: + cl = np.zeros(len(self.larr)) self.cls[:, i, j] = cl if j != i: self.cls[:, j, i] = cl From 27d519c839eb2bcbeab0d02dd058a808b071eaba Mon Sep 17 00:00:00 2001 From: David Alonso Date: Fri, 11 Sep 2020 18:33:17 +0100 Subject: [PATCH 33/35] fix zz --- tomo_challenge/classifiers/ComplexSOM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 08f232dd..ad4d0a5b 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -392,7 +392,7 @@ def train (self, training_data, training_z): # Initial edge guess nz_tot = np.sum(nzs, axis=0) cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) - cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + cumul_f = interp1d(z_cen, cumulative_fraction, bounds_error=False, fill_value=(0, 1)) edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) From e79cfab85edcfc00687d18dd77061aba86e1109e Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Mon, 14 Sep 2020 15:41:19 +0200 Subject: [PATCH 34/35] updated complexsom for sizes inclusion --- example/ComplexSOM.yaml | 6 ++++-- tomo_challenge/classifiers/ComplexSOM.py | 7 +++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index e6aea060..0171d217 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -11,12 +11,14 @@ run: ComplexSOM: run_5: # These settings are sent to the classifier - testing: True + testing: False bins: 5 som_dim: [51,51] num_groups: 100 num_threads: 32 group_type: 'redshift' + whiten: False + redshift_propset: 2 data_threshold: [0,30] sparse_frac: 1.0 plots: True @@ -29,7 +31,7 @@ run: # as well as the magnitudes colors: False errors: False - sizes: False + sizes: True run_10: # These 
settings are sent to the classifier testing: False diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 2ee47bc6..1d19fd0c 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -148,6 +148,8 @@ def train (self, training_data, training_z): elif redshift_propset==3: property_labels = StrVector(("quan_05","quan_16","quan_50","quan_84","quan_95")) property_expressions = ("quantile(data$redshift_true,probs=c(5,16,50,84,95)/100)") + property_labels = StrVector(("quan_05","quan_16","quan_84","quan_95")) + property_expressions = ("quantile(data$redshift_true,probs=c(5,16,84,95)/100)") else: property_labels = StrVector(("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true")) property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", @@ -214,7 +216,7 @@ def train (self, training_data, training_z): #Define the output SOM name if self.opt['sizes'] == True: - som_outname = f"SOM_{som_dim}_{self.bands}_withsize.pkl" + som_outname = f"SOM_{som_dim}_{self.bands}_withsizelin.pkl" else: som_outname = f"SOM_{som_dim}_{self.bands}.pkl" @@ -257,9 +259,6 @@ def train (self, training_data, training_z): n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) print(base.length(som.rx2['unit.classif'])) #Output the SOM - #base.save(som,file=som_outname) - #with open(f"test_{som_outname}", 'wb') as f: - # pickle.dump(som, f) with open(som_outname, 'wb') as f: pickle.dump(som, f) else: From 6b41b6a997a3a3e10f1ec9631fe65382fd9b1f2b Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Mon, 14 Sep 2020 23:39:54 +0200 Subject: [PATCH 35/35] updated yaml --- example/ComplexSOM.yaml => ComplexSOM.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) rename example/ComplexSOM.yaml => ComplexSOM.yaml (95%) diff --git a/example/ComplexSOM.yaml b/ComplexSOM.yaml similarity index 95% rename from example/ComplexSOM.yaml rename to ComplexSOM.yaml index 0171d217..5b76fa08 100644 --- a/example/ComplexSOM.yaml +++ b/ComplexSOM.yaml @@ -14,7 +14,7 @@ run: testing: False bins: 5 som_dim: [51,51] - num_groups: 100 + num_groups: 200 num_threads: 32 group_type: 'redshift' whiten: False @@ -40,6 +40,7 @@ run: num_groups: 200 num_threads: 32 group_type: 'redshift' + redshift_propset: 2 data_threshold: [0,30] sparse_frac: 1.0 plots: True @@ -52,5 +53,5 @@ run: # as well as the magnitudes colors: False errors: False - sizes: False + sizes: True
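For reference, a minimal usage sketch of the KL helper added in PATCH 31.
The toy N(z)s and galaxy counts below are illustrative only (not
survey-realistic); `get_cl_matrix` must be called before `get_kltrans`.

import numpy as np
import matplotlib.pyplot as plt
from snc import SnCalc

# Three toy Gaussian samples; the amplitudes set the total galaxy counts
z_arr = np.linspace(0, 2, 256)
nzs = [1E7 * np.exp(-0.5 * ((z_arr - zc) / 0.1)**2)
       for zc in (0.5, 0.9, 1.3)]

calc = SnCalc(z_arr, nzs, fsky=0.4)
calc.get_cl_matrix()

# Per-mode and cumulative Fisher information of the KL modes
fish_kl, fish_permode, fish_cumul = calc.get_kltrans()
plt.plot(np.arange(len(fish_cumul)) + 1, fish_cumul / fish_cumul[-1], 'k.-')
plt.xlabel('Number of KL modes')
plt.ylabel('Cumulative Fisher fraction')
plt.show()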