From a011d2147a35c2a1992101b4fc4fcb73e522ad1d Mon Sep 17 00:00:00 2001
From: David Alonso
Date: Sat, 22 Aug 2020 16:11:32 +0100
Subject: [PATCH 01/35] optimizer

---
 the_atlantics/example.py | 74 +++++++++
 the_atlantics/example_minimizer.py | 74 +++++++++
 the_atlantics/snc.py | 259 +++++++++++++++++++++++++++++
 3 files changed, 407 insertions(+)
 create mode 100644 the_atlantics/example.py
 create mode 100644 the_atlantics/example_minimizer.py
 create mode 100644 the_atlantics/snc.py

diff --git a/the_atlantics/example.py b/the_atlantics/example.py
new file mode 100644
index 00000000..76a8a8fd
--- /dev/null
+++ b/the_atlantics/example.py
@@ -0,0 +1,74 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from snc import SnCalc
+
+
+# Let's start with a "large" set of 12 samples
+n_bins = 12
+z_cent = np.linspace(0.2, 1.4, n_bins+1)+0.5/n_bins
+sigma_z = 0.05 * (1+z_cent)
+z_arr = np.linspace(0, 2, 1024)
+# Total number of galaxies in each sample
+n_gals = (z_cent/0.5)**2*np.exp(-(z_cent/0.5)**1.5) * 10. * 1.8E4 * 60**2
+
+
+def pz_gau(z, zc, sz):
+    return np.exp(-0.5*((z-zc)/sz)**2)/np.sqrt(2*np.pi*sz**2)
+
+
+# Create the N(z) arrays, which should integrate to the total number
+# of galaxies in each sample
+nzs = [ng * pz_gau(z_arr, zc, sz)
+       for ng, zc, sz in zip(n_gals, z_cent, sigma_z)]
+
+# Let's plot them
+plt.figure()
+for nz in nzs:
+    plt.plot(z_arr, nz)
+plt.xlabel(r'$z$', fontsize=16)
+plt.ylabel(r'$N(z)$', fontsize=16)
+
+
+c = SnCalc(z_arr, nzs)
+# Now let's do a dumb thing and bunch them up into
+# 4, 3, 2 and 1 subsamples
+assign4 = [np.array([0, 1, 2]),
+           np.array([3, 4, 5]),
+           np.array([6, 7, 8]),
+           np.array([9, 10, 11])]
+assign3 = [np.array([0, 1, 2, 3]),
+           np.array([4, 5, 6, 7]),
+           np.array([8, 9, 10, 11])]
+assign2 = [np.array([0, 1, 2, 3, 4, 5]),
+           np.array([6, 7, 8, 9, 10, 11])]
+assign1 = [np.arange(n_bins)]
+
+
+# OK, calculate S/N for all of these groupings
+snr_d4 = c.get_sn(assign4, full_output=True)
+print('4 bins, S/N = %.1lf' % (snr_d4['snr']))
+snr_d3 = c.get_sn(assign3, full_output=True)
+print('3 bins, S/N = %.1lf' % (snr_d3['snr']))
+snr_d2 = c.get_sn(assign2, full_output=True)
+print('2 bins, S/N = %.1lf' % (snr_d2['snr']))
+snr_d1 = c.get_sn(assign1, full_output=True)
+print('1 bin, S/N = %.1lf' % (snr_d1['snr']))
+
+
+# Note that the calculator also returns the power
+# spectra, if you want to look at them.
+plt.figure()
+for i in range(4):
+    for j in range(i, 4):
+        if i == j:
+            plt.plot(snr_d4['ls'],
+                     snr_d4['cl'][:, i, j],
+                     'k-')
+            plt.plot(snr_d4['ls'],
+                     snr_d4['nl'][0, i, j]*np.ones_like(snr_d4['ls']), 'k--')
+        else:
+            plt.plot(snr_d4['ls'],
+                     snr_d4['cl'][:, i, j],
+                     '-', c='#AAAAAA')
+plt.loglog()
+plt.show()
diff --git a/the_atlantics/example_minimizer.py b/the_atlantics/example_minimizer.py
new file mode 100644
index 00000000..697980be
--- /dev/null
+++ b/the_atlantics/example_minimizer.py
@@ -0,0 +1,74 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from snc import SnCalc
+from scipy.optimize import minimize
+
+
+# Let's start with a "large" set of 12 samples
+n_bins = 12
+z_cent = np.linspace(0.2, 1.4, n_bins+1)+0.5/n_bins
+sigma_z = 0.05 * (1+z_cent)
+z_arr = np.linspace(0, 2, 1024)
+# Total number of galaxies in each sample
+n_gals = (z_cent/0.5)**2*np.exp(-(z_cent/0.5)**1.5) * 10. * 1.8E4 * 60**2
+
+
+def pz_gau(z, zc, sz):
+    return np.exp(-0.5*((z-zc)/sz)**2)/np.sqrt(2*np.pi*sz**2)
+
+
+# Create the N(z) arrays, which should integrate to the total number
+# of galaxies in each sample.
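+# (A quick sanity check, as a sketch: scipy.integrate.simps(nzs[i], x=z_arr)
+# should come out close to n_gals[i], up to the Gaussian tails truncated at
+# the edges of z_arr.)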
+nzs = [ng * pz_gau(z_arr, zc, sz) + for ng, zc, sz in zip(n_gals, z_cent, sigma_z)] + +# Let's plot them +plt.figure() +for nz in nzs: + plt.plot(z_arr, nz) +plt.xlabel(r'$z$', fontsize=16) +plt.ylabel(r'$N(z)$', fontsize=16) + +# Now initialize a S/N calculator for these initial groups. +c = SnCalc(z_arr, nzs) + +# OK, let's brute-force explore the total S/N as a function +# of the bin edge for a 2-bin scenario. +zs = np.linspace(0.01, 1.5, 128) +sns = np.array([c.get_sn_from_edges(np.array([z])) for z in zs]) +plt.figure() +plt.plot(zs, sns) +plt.xlabel('$z$') +plt.ylabel('$S/N$') + +# Now let's do a 3-bin case (more coarsely-grained so we can do it quickly) +zs = np.linspace(0.01, 1.5, 32) +sns = np.array([[c.get_sn_from_edges(np.array([z1, z2])) for z1 in zs] + for z2 in zs]) + +plt.figure() +plt.imshow(sns, vmin=1440, extent=[0.01, 1.5, 0.01, 1.5], + origin='lower') +plt.xlabel('$z_1$') +plt.ylabel('$z_2$') +cb = plt.colorbar() +cb.set_label('$S/N$') + + +# Finally, let's write the function to minimize and optimize for a 4-bin case. +def minus_sn(edges, calc): + return -calc.get_sn_from_edges(edges) + + +edges_0 = np.array([0.3, 0.6, 0.9]) +res = minimize(minus_sn, edges_0, method='Powell', args=(c,)) +print("WL final edges: ", res.x) +print("Maximum S/N: ", c.get_sn_from_edges(res.x)) + + +# That was for weak lensing. Let's do the same thing for clustering. +c = SnCalc(z_arr, nzs, use_clustering=True) +res = minimize(minus_sn, edges_0, method='Powell', args=(c,)) +print("GC final edges: ", res.x) +print("Maximum S/N: ", c.get_sn_from_edges(res.x)) +plt.show() diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py new file mode 100644 index 00000000..d62379d9 --- /dev/null +++ b/the_atlantics/snc.py @@ -0,0 +1,259 @@ +import numpy as np +import pyccl as ccl +from scipy.integrate import simps +import os + + +assign_params_default = {'p_inbin_thr': 0.5, + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} + + +class SnCalc(object): + edges_large = 100. + + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, + s_gamma=0.28, use_clustering=False): + """ S/N calculator + + Args: + z_arr (array_like): array of redshifts at which all the N(z)s are + sampled. They should be linearly spaced. + nz_list (list): list of arrays containing the redshift + distributions of all the initial groups (each array in the + list corresponds to one group). The arrays should contain the + redshift distribution sampled at `z_arr`, with its integral + over `z_arr` equal to the number of galaxies in that group. + fsky (float): sky fraction (for SNR calculation and for + transforming number of galaxies into number densities). + lmax (float): maximum ell. + d_ell (float): ell bandpower width. + s_gamma (float): rms ellipticity scatter (relevant if + `use_clustering=False`). + use_clustering (bool): if `True`, SNR will be computed for + clustering instead of lensing. 
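+
+        Example (a minimal sketch with made-up Gaussian N(z)s; the
+        amplitudes below correspond to roughly 2.5e5 galaxies per group):
+
+            >>> z = np.linspace(0, 2, 512)
+            >>> nz1 = 1e6 * np.exp(-0.5 * ((z - 0.5) / 0.1)**2)
+            >>> nz2 = 1e6 * np.exp(-0.5 * ((z - 1.0) / 0.1)**2)
+            >>> calc = SnCalc(z, [nz1, nz2])
+            >>> snr = calc.get_sn([np.array([0]), np.array([1])])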
+ """ + self.s_gamma = s_gamma + self.fsky = fsky + self.lmax = lmax + self.d_ell = d_ell + self.larr = np.arange(2, lmax, d_ell)+d_ell/2 + self.n_ell = len(self.larr) + self.n_samples = len(nz_list) + self.z_arr = z_arr + self.nz_list = nz_list + self.n_gals = np.array([simps(nz, x=self.z_arr) + for nz in self.nz_list]) + self.pz_list = self.nz_list / self.n_gals[:, None] + self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) + for nz in self.nz_list]) / self.n_gals + self.n_dens = self.n_gals / (4*np.pi*self.fsky) + self.cls = None + self.use_clustering = use_clustering + + def _bz_model(self, cosmo, z): + return 0.95/ccl.growth_factor(cosmo, 1./(1+z)) + + def get_cl_matrix(self, fname_save=None, recompute=False): + """ Computes matrix of power spectra between all the initial groups. + + Args: + fname_save (string): if not None, the result will be saved to a + file by this name, and if present and `recompute=False`, the + matrix will be read from that file. + recompute (bool): if `True`, any previous calculations of the + C_ell matrix stored on file will be ignored, and the matrix + will be recomputed. + """ + if fname_save is not None: + if os.path.isfile(fname_save) and not recompute: + self.cls = np.load(fname_save)['cls'] + return + + # Cosmology + cosmo = ccl.CosmologyVanillaLCDM() + + # Tracers + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + + # Cls + self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) + for i in range(self.n_samples): + for j in range(i, self.n_samples): + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr) + self.cls[:, i, j] = cl + if j != i: + self.cls[:, j, i] = cl + + if fname_save is not None: + np.savez(fname_save, cls=self.cls) + + def get_resample_metadata(self, assign): + """ Transform an assignment list into a weight matrix and the + corresponding number densities. + + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + """ + n_resamples = len(assign) + weight_res = np.zeros([n_resamples, self.n_samples]) + n_dens_res = np.zeros(n_resamples) + for ia, a in enumerate(assign): + ndens = self.n_dens[a] + ndens_tot = np.sum(ndens) + n_dens_res[ia] = ndens_tot + weight_res[ia, a] = ndens / ndens_tot + return weight_res, n_dens_res + + def _get_cl_resamples(self, weights): + """ Gets C_ell matrix for resampled groups from weights matrix. + """ + if self.cls is None: + self.get_cl_matrix() + return np.einsum('jl,km,ilm', weights, weights, self.cls) + + def _get_nl_resamples(self, n_dens): + """ Gets noise contribution to C_ell matrix for resampled groups + from list of number densities. + """ + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_wn(self, weights, n_dens, full_output=False): + """ Compute signal-to-noise ratio from weights matrix and number + densities of the resampled groups. + + Args: + weights (array_like): weights matrix of shape + `(N_resample, N_initial)`, where `N_resample` is the + number of new groups, and `N_initial` is the original + number of groups. Each entry corresponds to the weight + which with a given initial group enters the new set of + groups. 
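+                For example, merging initial groups 0 and 1, with number
+                densities n0 and n1, into a single new group corresponds
+                to the row [n0/(n0+n1), n1/(n0+n1), 0, ...], which is what
+                `get_resample_metadata` builds.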
+ n_dens (array_like): number density (in sterad^-1) of the + new groups. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + sl = self._get_cl_resamples(weights) + nl = self._get_nl_resamples(n_dens) + cl = sl + nl + icl = np.linalg.inv(cl) + sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) + sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], + axis=2) + trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) + snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * + self.d_ell / self.fsky)) + + if full_output: + dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} + return dret + else: + return snr + + def get_sn(self, assign, full_output=False): + """ Compute signal-to-noise ratio from a bin assignment list. + + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + w, ndens = self.get_resample_metadata(assign) + return self.get_sn_wn(w, ndens, + full_output=full_output) + + def check_edges(self, edges): + """ Returns `True` if there's something wrong with the edges. + """ + return np.any(edges < 0) or \ + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) + + def assign_from_edges(self, edges, assign_params=assign_params_default): + """ Get assignment list from edges and assign parameters. + + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + + Returns: + List of assignment arrays ready to be used in e.g. `get_sn`. + """ + nbins = len(edges) + 1 + # Bin IDs based on mean z + ids = np.digitize(self.z_means, bins=edges) + if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + # Calculate probabilities in each bin + zgroups = np.digitize(self.z_arr, bins=edges) + masks = [zgroups == i for i in range(nbins)] + if assign_params['use_p_outbin']: + # Matrix of probabilities in each bin + pzs = np.array([[simps(pz[m], x=self.z_arr[m]) + for m in masks] + for pz in self.pz_list]) + pzd = np.array([pzs[j, ids[j]] + for j in range(self.n_samples)]) + else: + # Overlaps in own bin + pzd = np.array([simps(pz[masks[i]], + x=self.z_arr[masks[i]]) + for pz, i in zip(self.pz_list, ids)]) + + # Throw away based on in-bin probability + if assign_params['use_p_inbin']: + ids[pzd < assign_params['p_inbin_thr']] = -1 + # Throw away based on off-bin probability + if assign_params['use_p_outbin']: + ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 + for p in pzs])] = -1 + # Assignment list + return [np.where(ids == i)[0] for i in np.unique(ids)] + + def get_sn_from_edges(self, edges, full_output=False, + assign_params=assign_params_default): + """ Compute signal-to-noise ratio from a set of edges and assignment + parameters. + + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + full_output (bool): if true, a dictionary with additional + information will be returned. 
Otherwise just total S/N. + + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + if self.check_edges(edges): + return 0 + assign = self.assign_from_edges(edges, assign_params=assign_params) + return self.get_sn(assign, full_output=full_output) From a0317532b78f650d3909bbdd4f63433f2d85c1a8 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Sat, 22 Aug 2020 16:36:01 +0100 Subject: [PATCH 02/35] tested for 100 samples --- the_atlantics/example_minimizer.py | 38 ++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/the_atlantics/example_minimizer.py b/the_atlantics/example_minimizer.py index 697980be..8e168c7c 100644 --- a/the_atlantics/example_minimizer.py +++ b/the_atlantics/example_minimizer.py @@ -5,7 +5,7 @@ # Let's start with a "large" set of 12 samples -n_bins = 12 +n_bins = 100 z_cent = np.linspace(0.2, 1.4, n_bins+1)+0.5/n_bins sigma_z = 0.05 * (1+z_cent) z_arr = np.linspace(0, 2, 1024) @@ -30,20 +30,31 @@ def pz_gau(z, zc, sz): plt.ylabel(r'$N(z)$', fontsize=16) # Now initialize a S/N calculator for these initial groups. -c = SnCalc(z_arr, nzs) - +c_wl = SnCalc(z_arr, nzs, use_clustering=False) +c_gc = SnCalc(z_arr, nzs, use_clustering=True) +print("Initializing WL") +c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bins) +print("Initializing GC") +c_gc.get_cl_matrix(fname_save='gc_nb%d.npz' % n_bins) + +# Let's focus on WL for now. # OK, let's brute-force explore the total S/N as a function # of the bin edge for a 2-bin scenario. +print("Exploring 2-bins") zs = np.linspace(0.01, 1.5, 128) -sns = np.array([c.get_sn_from_edges(np.array([z])) for z in zs]) +sns_wl = np.array([c_wl.get_sn_from_edges(np.array([z])) for z in zs]) +sns_gc = np.array([c_gc.get_sn_from_edges(np.array([z])) for z in zs]) plt.figure() -plt.plot(zs, sns) +plt.plot(zs, sns_wl/np.amax(sns_wl), 'r-', label='Lensing') +plt.plot(zs, sns_gc/np.amax(sns_gc), 'k-', label='Clustering') plt.xlabel('$z$') -plt.ylabel('$S/N$') +plt.ylabel(r'$(S/N)/(S/N)_{\rm max}$') +plt.legend(loc='upper left', frameon=False) # Now let's do a 3-bin case (more coarsely-grained so we can do it quickly) +print("Exploring 3-bins") zs = np.linspace(0.01, 1.5, 32) -sns = np.array([[c.get_sn_from_edges(np.array([z1, z2])) for z1 in zs] +sns = np.array([[c_wl.get_sn_from_edges(np.array([z1, z2])) for z1 in zs] for z2 in zs]) plt.figure() @@ -60,15 +71,16 @@ def minus_sn(edges, calc): return -calc.get_sn_from_edges(edges) +print("Optimizing WL") edges_0 = np.array([0.3, 0.6, 0.9]) -res = minimize(minus_sn, edges_0, method='Powell', args=(c,)) +res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) print("WL final edges: ", res.x) -print("Maximum S/N: ", c.get_sn_from_edges(res.x)) - +print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)) +print(" ") # That was for weak lensing. Let's do the same thing for clustering. 
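+# A note on the optimizer choice: bin assignment inside get_sn_from_edges
+# uses np.digitize on the fixed group mean redshifts, so the objective is
+# piecewise constant in the edges. A derivative-free method like Powell can
+# still make headway here, whereas a gradient-based minimizer would see a
+# zero gradient almost everywhere.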
-c = SnCalc(z_arr, nzs, use_clustering=True) -res = minimize(minus_sn, edges_0, method='Powell', args=(c,)) +print("Optimizing GC") +res = minimize(minus_sn, edges_0, method='Powell', args=(c_gc,)) print("GC final edges: ", res.x) -print("Maximum S/N: ", c.get_sn_from_edges(res.x)) +print("Maximum S/N: ", c_gc.get_sn_from_edges(res.x)) plt.show() From 95eb643208818e8be45bcd2d7b441f82fa5e25f9 Mon Sep 17 00:00:00 2001 From: Bela Abolfathi Date: Fri, 28 Aug 2020 12:16:19 -0700 Subject: [PATCH 03/35] Add simple and complex classifiers --- tomo_challenge/classifiers/ComplexSOM.py | 352 +++++++++++++++++++++++ tomo_challenge/classifiers/SimpleSOM.py | 323 +++++++++++++++++++++ 2 files changed, 675 insertions(+) create mode 100644 tomo_challenge/classifiers/ComplexSOM.py create mode 100644 tomo_challenge/classifiers/SimpleSOM.py diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py new file mode 100644 index 00000000..501617e8 --- /dev/null +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -0,0 +1,352 @@ +""" +This is an example tomographic bin generator using a random forest. + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import rpy2 +import rpy2.robjects as ro +import rpy2.robjects.packages as rpack +from rpy2.robjects.vectors import StrVector, IntVector, DataFrame, FloatVector, FactorVector +import pandas as pd +from rpy2.robjects import pandas2ri +from rpy2.robjects.conversion import localconverter +from ..snc import SnCalc +from scipy.optimize import minimize +from scipy.integrate import simps + +#Check that all the needed packages are installed +# R package nameo +packnames = ('data.table','itertools','foreach','doParallel','RColorBrewer','devtools','matrixStats') +base=ro.packages.importr("base") +utils=ro.packages.importr("utils") +stats=ro.packages.importr("stats") +gr=ro.packages.importr("graphics") +dev=ro.packages.importr("grDevices") +utils.chooseCRANmirror(ind=1) +# Selectively install what needs to be installed. +names_to_install = [x for x in packnames if not rpack.isinstalled(x)] +if len(names_to_install) > 0: + utils.install_packages(StrVector(names_to_install)) +base.Sys_setenv(TAR=base.system("which tar",intern=True)) +devtools=ro.packages.importr("devtools") +devtools.install_github("AngusWright/helpRfuncs") +devtools.install_github("AngusWright/kohonen/kohonen") +kohonen=ro.packages.importr("kohonen") + +class ComplexSOM(Tomographer): + """ Simplistic SOM Classifier """ + + # valid parameter -- see below + valid_options = ['bins','som_dim','num_groups','num_threads', + 'group_type','data_threshold','sparse_frac','plots','plot_dir','metric'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = False + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+            'som_dim' - dimensions of the SOM
+            'num_groups' - number of SOM groups/associations
+            'num_threads' - number of threads to use
+            'sparse_frac' - the sparse-sampling fraction used for training
+            'group_type' - do we want grouping by colour or redshift
+            'data_threshold' - lower and upper limits applied to the data
+                values during training
+            'plots' - whether to produce diagnostic plots
+            'plot_dir' - output directory for the diagnostic plots
+            'metric' - metric to optimise ('SNR_ww' uses lensing,
+                anything else uses clustering)
+
+        """
+        self.bands = bands
+        self.opt = options
+
+    def train (self, training_data, training_z):
+        """Trains the SOM and outputs the resulting bins
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxies x Nbands
+            training data, each row is a galaxy, each column is a band as per
+            band defined above
+        training_z: numpy array, size Ngalaxies
+            true redshift for the training sample
+
+        """
+
+        print("Initialising")
+        #Number of tomographic bins
+        n_bin = self.opt['bins']
+        #Dimensions of the SOM
+        som_dim = self.opt['som_dim']
+        #Number of discrete SOM groups
+        num_groups = self.opt['num_groups']
+        #Number of threads
+        num_threads = self.opt['num_threads']
+        #Sparse Frac
+        sparse_frac = self.opt['sparse_frac']
+        #Data Threshold
+        data_threshold = self.opt['data_threshold']
+        #Metric
+        metric = self.opt['metric']
+        #Plots
+        plots = self.opt['plots']
+        #Plot Output Directory
+        plot_dir = self.opt['plot_dir']
+        #Group Type
+        group_type = self.opt['group_type']
+        #Check that the group type is a valid choice
+        if group_type != 'colour' and group_type != 'redshift':
+            group_type = 'redshift'
+
+        #Define the redshift summary statistics (used for making groups in the 'redshift' case)
+        property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","N","iqr_z_true")
+        property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)",
+                                "mad(data$redshift_true)","nrow(data)",
+                                "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))")
+        #Define the SOM variables
+        if self.bands == 'riz':
+            #riz bands
+            #expressions = ("r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("r-i","r-z","i-z",
+                           "z","r-i-(i-z)")
+        elif self.bands == 'griz':
+            #griz bands
+            #expressions = ("g_mag-r_mag","g_mag-i_mag",
+            #               "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","g_mag-r_mag-(r_mag-i_mag)",
+            #               "r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("g-r","g-i",
+                           "g-z","r-i","r-z","i-z",
+                           "z","g-r-(r-i)",
+                           "r-i-(i-z)")
+        elif self.bands == 'grizy':
+            #grizy bands
+            #expressions = ("g_mag-r_mag","g_mag-i_mag",
+            #               "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag",
+            #               "z_mag-y_mag","z_mag","g_mag-r_mag-(r_mag-i_mag)",
+            #               "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)")
+            expressions = ("g-r","g-i",
+                           "g-z","g-y","r-i","r-z","r-y","i-z","i-y",
+                           "z-y","z","g-r-(r-i)",
+                           "r-i-(i-z)","i-z-(z-y)")
+        elif self.bands == 'ugriz':
+            #ugriz bands
+            #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","g_mag-r_mag","g_mag-i_mag",
+            #               "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)",
+            #               "r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("u-g","u-r","u-i","u-z","g-r","g-i",
+                           "g-z","r-i","r-z","i-z",
+                           "z","u-g-(g-r)","g-r-(r-i)",
+                           "r-i-(i-z)")
+        elif self.bands == 'ugrizy':
+            #ugrizy bands
+            #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","u_mag-y_mag","g_mag-r_mag","g_mag-i_mag",
+            #
"g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", + # "z_mag-y_mag","z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + + print("Preparing the data") + training_data = pd.DataFrame.from_dict(training_data) + #Add the redshift variable to the train data + print("Adding redshift info to training data") + training_data['redshift_true'] = training_z + + if sparse_frac < 1: + print("Sparse Sampling the training data") + cut = np.random.uniform(0, 1, training_z.size) < sparse_frac + training_data = training_data[cut] + training_z = training_z[cut] + + #Construct the training data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + train_df = ro.conversion.py2rpy(training_data) + + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + + #If grouping by redshift, construct the cell redshift statistics + if group_type == 'redshift' or plots == True: + print("Constructing cell-based redshift properties") + #Construct the Nz properties per SOM cell + cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + print("Constructing redshift-based hierarchical cluster tree") + #Cluster the SOM cells into num_groups groups + hclust=stats.hclust(stats.dist(cell_prop.rx2['property'])) + cell_group=stats.cutree(hclust,k=num_groups) + + #Assign the cell groups to the SOM structure + som.rx2['hclust']=hclust + som.rx2['cell_clust']=cell_group + + #Construct the Nz properties per SOM group + print("Constructing group-based redshift properties") + group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels), + n_cluster_bins=num_groups) + + #extract the training som (just for convenience) + train_som = group_prop.rx2('som') + + if plots == True: + #Make the diagnostic plots + props = group_prop.rx2('property') + print(base.colnames(props)) + print("Constructing SOM diagnostic figures") + #Initialise the plot device + dev.png(file=f'{plot_dir}/SOMfig_%02d.png',height=5,width=5,res=220,unit='in') + #Paint the SOM by the training properties + for i in range(len(expressions)): + gr.plot(train_som,property=i+1,shape='straight',heatkeywidth=som_dim[0]/20,ncolors=1e3,zlim=FloatVector([-3,3])) + #Plot the standard diagnostics: + #Count + gr.plot(train_som,type='count',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,ncolors=1e3) + #Quality + gr.plot(train_som,type='quality',shape='straight',heatkeywidth=som_dim[0]/20,zlim=FloatVector([0,0.5]),ncolors=1e3) + #UMatrix + gr.plot(train_som,type='dist',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,zlim=FloatVector([0,0.5]),ncolors=1e3) + #Sometimes these figures need customised limits; the "try" stops bad plots from stopping the code + + #Paint by the 
redshift diagnostics: + #mean redshift + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group mean redshift',zlim=FloatVector([0,1.8])) + #redshift std + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group redshift stdev',zlim=FloatVector([0,0.2])) + #2sigma redshift IQR + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='group 2sigma IQR',zlim=FloatVector([0,0.4])) + #N group + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') + + #Close the plot device + dev.dev_off() + + print("Constructing the per-group Nz") + #Get the group assignments for the training data + train_group = np.array(train_som.rx2['clust.classif']) + tab=base.table(FactorVector(train_som.rx2['clust.classif'],levels=base.seq(num_groups))) + #Construct the Nz z-axis + z_arr = np.linspace(0, 2, 1024) + z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2. + #Construct the per-group Nz + nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] + # Now initialize a S/N calculator for these initial groups. + if metric == 'SNR_ww': + c_wl = SnCalc(z_cen, nzs, use_clustering=False) + else: + c_wl = SnCalc(z_cen, nzs, use_clustering=True) + print("Initializing WL") + c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin) + # Finally, let's write the function to minimize and optimize for a 4-bin case. + def minus_sn(edges, calc): + return -calc.get_sn_from_edges(edges) + + print("Optimizing WL") + edges_0 = np.linspace(0, 2, n_bin-1) + res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) + print("WL final edges: ", res.x) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)) + print(" ") + + + + #Construct the per-group Nz + print("Outputting trained SOM and redshift ordering of SOM groups") + + print("Finished") + self.train_som = train_som + self.group_z = group_z + self.z_order = z_order + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. 
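+                For example, a returned array [0, 2, 1, ...] places the
+                first galaxy in tomographic bin 0, the second in bin 2,
+                and so on.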
+ """ + + #Number of tomographic bins + n_bin = self.opt['bins'] + + print("Preparing the data") + data = pd.DataFrame.from_dict(data) + + #Construct the validation data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + data_df = ro.conversion.py2rpy(data) + + print("Parsing the validation data into the SOM groupings") + #Generate the validation associations/groups + group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, + expression="nrow(data)",expr_label="N", + n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) + + #Calculate the cumulative count + print("Generate cumulative source counts a.f.o. group mean z") + zcumsum=base.cumsum(group_prop.rx2('property').rx(self.z_order)) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + print("Assign the groups to tomographic bins") + p = np.linspace(0, 100, n_bin + 1) + n_edges = np.percentile(zcumsum, p) + + # Now put the groups into redshift bins. + group_bins = FloatVector(base.cut(FloatVector(zcumsum),FloatVector(n_edges), + include=True)).rx(base.order(self.z_order)) + + #extract the validation som (just for convenience) + valid_som = group_prop.rx2('som') + + #Assign the sources, by group, to tomographic bins + print("Output source tomographic bin assignments") + valid_bin = base.unsplit(group_bins,FactorVector(valid_som.rx2['clust.classif'], + levels=base.seq(self.opt['num_groups'])),drop=False) + #valid_bin = valid_som.rx2['clust.classif'] + valid_bin = np.array(valid_bin)-1 + + return valid_bin + diff --git a/tomo_challenge/classifiers/SimpleSOM.py b/tomo_challenge/classifiers/SimpleSOM.py new file mode 100644 index 00000000..f3b0b74c --- /dev/null +++ b/tomo_challenge/classifiers/SimpleSOM.py @@ -0,0 +1,323 @@ +""" +This is an example tomographic bin generator using a random forest. + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import rpy2 +import rpy2.robjects as ro +import rpy2.robjects.packages as rpack +from rpy2.robjects.vectors import StrVector, IntVector, DataFrame, FloatVector, FactorVector +import pandas as pd +from rpy2.robjects import pandas2ri +from rpy2.robjects.conversion import localconverter + +#Check that all the needed packages are installed +# R package nameo +packnames = ('data.table','itertools','foreach','doParallel','RColorBrewer','devtools','matrixStats') +base=ro.packages.importr("base") +utils=ro.packages.importr("utils") +stats=ro.packages.importr("stats") +gr=ro.packages.importr("graphics") +dev=ro.packages.importr("grDevices") +utils.chooseCRANmirror(ind=1) +# Selectively install what needs to be installed. 
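+# (i.e. so that importing this module repeatedly does not re-install
+# packages that are already present)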
+names_to_install = [x for x in packnames if not rpack.isinstalled(x)]
+if len(names_to_install) > 0:
+    utils.install_packages(StrVector(names_to_install))
+base.Sys_setenv(TAR=base.system("which tar",intern=True))
+devtools=ro.packages.importr("devtools")
+devtools.install_github("AngusWright/helpRfuncs")
+devtools.install_github("AngusWright/kohonen/kohonen")
+kohonen=ro.packages.importr("kohonen")
+
+class SimpleSOM(Tomographer):
+    """ Simplistic SOM Classifier """
+
+    # valid parameter -- see below
+    valid_options = ['bins','som_dim','num_groups','num_threads',
+                     'group_type','data_threshold','sparse_frac','plots','plot_dir']
+    # this setting, if True, means arrays will be sent to train and apply
+    # instead of dictionaries
+    wants_arrays = False
+
+    def __init__ (self, bands, options):
+        """Constructor
+
+        Parameters:
+        -----------
+        bands: str
+            string containing valid bands, like 'riz' or 'griz'
+        options: dict
+            options come through here. Valid keys are listed as valid_options
+            class variable.
+
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+            'som_dim' - dimensions of the SOM
+            'num_groups' - number of SOM groups/associations
+            'num_threads' - number of threads to use
+            'sparse_frac' - the sparse-sampling fraction used for training
+            'group_type' - do we want grouping by colour or redshift
+            'data_threshold' - lower and upper limits applied to the data
+                values during training
+            'plots' - whether to produce diagnostic plots
+            'plot_dir' - output directory for the diagnostic plots
+
+        """
+        self.bands = bands
+        self.opt = options
+
+    def train (self, training_data, training_z):
+        """Trains the SOM and outputs the resulting bins
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxies x Nbands
+            training data, each row is a galaxy, each column is a band as per
+            band defined above
+        training_z: numpy array, size Ngalaxies
+            true redshift for the training sample
+
+        """
+
+        print("Initialising")
+        #Number of tomographic bins
+        n_bin = self.opt['bins']
+        #Dimensions of the SOM
+        som_dim = self.opt['som_dim']
+        #Number of discrete SOM groups
+        num_groups = self.opt['num_groups']
+        #Number of threads
+        num_threads = self.opt['num_threads']
+        #Sparse Frac
+        sparse_frac = self.opt['sparse_frac']
+        #Data Threshold
+        data_threshold = self.opt['data_threshold']
+        #Plots
+        plots = self.opt['plots']
+        #Plot Output Directory
+        plot_dir = self.opt['plot_dir']
+        #Group Type
+        group_type = self.opt['group_type']
+        #Check that the group type is a valid choice
+        if group_type != 'colour' and group_type != 'redshift':
+            group_type = 'redshift'
+
+        #Define the redshift summary statistics (used for making groups in the 'redshift' case)
+        property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","N","iqr_z_true")
+        property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)",
+                                "mad(data$redshift_true)","nrow(data)",
+                                "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))")
+        #Define the SOM variables
+        if self.bands == 'riz':
+            #riz bands
+            #expressions = ("r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("r-i","r-z","i-z",
+                           "z","r-i-(i-z)")
+        elif self.bands == 'griz':
+            #griz bands
+            #expressions = ("g_mag-r_mag","g_mag-i_mag",
+            #               "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag",
+            #               "z_mag","g_mag-r_mag-(r_mag-i_mag)",
+            #               "r_mag-i_mag-(i_mag-z_mag)")
+            expressions = ("g-r","g-i",
+                           "g-z","r-i","r-z","i-z",
+                           "z","g-r-(r-i)",
+                           "r-i-(i-z)")
+        elif self.bands == 'grizy':
+            #grizy bands
+            #expressions = ("g_mag-r_mag","g_mag-i_mag",
+            #
"g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", + # "z_mag-y_mag","z_mag","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") + expressions = ("g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + elif self.bands == 'ugriz': + #ugrizy bands + #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","g_mag-r_mag","g_mag-i_mag", + # "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", + # "z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)") + expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)") + elif self.bands == 'ugrizy': + #ugrizy bands + #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","u_mag-y_mag","g_mag-r_mag","g_mag-i_mag", + # "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", + # "z_mag-y_mag","z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + + print("Preparing the data") + training_data = pd.DataFrame.from_dict(training_data) + #Add the redshift variable to the train data + print("Adding redshift info to training data") + training_data['redshift_true'] = training_z + + if sparse_frac < 1: + print("Sparse Sampling the training data") + cut = np.random.uniform(0, 1, training_z.size) < sparse_frac + training_data = training_data[cut] + + #Construct the training data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + train_df = ro.conversion.py2rpy(training_data) + + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + + #If grouping by redshift, construct the cell redshift statistics + if group_type == 'redshift' or plots == True: + print("Constructing cell-based redshift properties") + #Construct the Nz properties per SOM cell + cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + print("Constructing redshift-based hierarchical cluster tree") + #Cluster the SOM cells into num_groups groups + hclust=stats.hclust(stats.dist(cell_prop.rx2['property'])) + cell_group=stats.cutree(hclust,k=num_groups) + + #Assign the cell groups to the SOM structure + som.rx2['hclust']=hclust + som.rx2['cell_clust']=cell_group + + #Construct the Nz properties per SOM group + print("Constructing group-based redshift properties") + group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels), + n_cluster_bins=num_groups) + + #extract the training som (just for convenience) + train_som = group_prop.rx2('som') + + if plots == True: + #Make the diagnostic plots + props = 
group_prop.rx2('property') + print(base.colnames(props)) + print("Constructing SOM diagnostic figures") + #Initialise the plot device + dev.png(file=f'{plot_dir}/SOMfig_%02d.png',height=5,width=5,res=220,unit='in') + #Paint the SOM by the training properties + for i in range(len(expressions)): + gr.plot(train_som,property=i+1,shape='straight',heatkeywidth=som_dim[0]/20,ncolors=1e3,zlim=FloatVector([-3,3])) + #Plot the standard diagnostics: + #Count + gr.plot(train_som,type='count',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,ncolors=1e3) + #Quality + gr.plot(train_som,type='quality',shape='straight',heatkeywidth=som_dim[0]/20,zlim=FloatVector([0,0.5]),ncolors=1e3) + #UMatrix + gr.plot(train_som,type='dist',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,zlim=FloatVector([0,0.5]),ncolors=1e3) + #Sometimes these figures need customised limits; the "try" stops bad plots from stopping the code + + #Paint by the redshift diagnostics: + #mean redshift + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group mean redshift',zlim=FloatVector([0,1.8])) + #redshift std + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group redshift stdev',zlim=FloatVector([0,0.2])) + #2sigma redshift IQR + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='group 2sigma IQR',zlim=FloatVector([0,0.4])) + #N group + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') + + #Close the plot device + dev.dev_off() + + + + print("Outputting trained SOM and redshift ordering of SOM groups") + #Extract the mean-z per group + group_z = group_prop.rx2('property').rx(True, + base.which(group_prop.rx2('property').colnames.ro=='mean_z_true')) + #Order the groups by mean z + z_order = base.order(group_z) + + print("Finished") + self.train_som = train_som + self.group_z = group_z + self.z_order = z_order + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + + #Number of tomographic bins + n_bin = self.opt['bins'] + + print("Preparing the data") + data = pd.DataFrame.from_dict(data) + + #Construct the validation data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + data_df = ro.conversion.py2rpy(data) + + print("Parsing the validation data into the SOM groupings") + #Generate the validation associations/groups + group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, + expression="nrow(data)",expr_label="N", + n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) + + #Calculate the cumulative count + print("Generate cumulative source counts a.f.o. 
group mean z") + zcumsum=base.cumsum(group_prop.rx2('property').rx(self.z_order)) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + print("Assign the groups to tomographic bins") + p = np.linspace(0, 100, n_bin + 1) + n_edges = np.percentile(zcumsum, p) + + # Now put the groups into redshift bins. + group_bins = FloatVector(base.cut(FloatVector(zcumsum),FloatVector(n_edges), + include=True)).rx(base.order(self.z_order)) + + #extract the validation som (just for convenience) + valid_som = group_prop.rx2('som') + + #Assign the sources, by group, to tomographic bins + print("Output source tomographic bin assignments") + valid_bin = base.unsplit(group_bins,FactorVector(valid_som.rx2['clust.classif'], + levels=base.seq(self.opt['num_groups'])),drop=False) + #valid_bin = valid_som.rx2['clust.classif'] + valid_bin = np.array(valid_bin)-1 + + return valid_bin + From 57f15155e6acbc3710e50b6724e88c16ae9d40b4 Mon Sep 17 00:00:00 2001 From: Bela Abolfathi Date: Fri, 28 Aug 2020 12:21:45 -0700 Subject: [PATCH 04/35] Added simple and complex .yaml --- example/ComplexSOM.yaml | 29 +++++++++++++++++++++++++++++ example/SimpleSOM.yaml | 28 ++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 example/ComplexSOM.yaml create mode 100644 example/SimpleSOM.yaml diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml new file mode 100644 index 00000000..63a97e3e --- /dev/null +++ b/example/ComplexSOM.yaml @@ -0,0 +1,29 @@ +metrics: [SNR_3x2] +bands: riz +training_file: data/training.hdf5 +validation_file: data/validation_small.hdf5 +output_file: example/example_output.txt +# Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" +metrics_impl: firecrown + +run: + # This is a class name which will be looked up + ComplexSOM: + run_3: + # These settings are sent to the classifier + bins: 5 + som_dim: [21,21] + num_groups: 100 + num_threads: 32 + group_type: 'redshift' + data_threshold: [0,30] + sparse_frac: 0.05 + plots: True + plot_dir: 'plots' + metric: "SNR_3x2" + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: False + errors: False + diff --git a/example/SimpleSOM.yaml b/example/SimpleSOM.yaml new file mode 100644 index 00000000..6f261f85 --- /dev/null +++ b/example/SimpleSOM.yaml @@ -0,0 +1,28 @@ +metrics: [SNR_3x2] +bands: riz +training_file: data/training.hdf5 +validation_file: data/validation_small.hdf5 +output_file: example/example_output.txt +# Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" +metrics_impl: firecrown + +run: + # This is a class name which will be looked up + SimpleSOM: + run_3: + # These settings are sent to the classifier + bins: 10 + som_dim: [151,151] + num_groups: 200 + num_threads: 32 + group_type: 'redshift' + data_threshold: [0,30] + sparse_frac: 0.05 + plots: True + plot_dir: 'plots' + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: False + errors: False + From a6465c1b0f3b6ca9bc062b980a257da268d7da60 Mon Sep 17 00:00:00 2001 From: Bela Abolfathi Date: Fri, 28 Aug 2020 12:23:16 -0700 Subject: [PATCH 05/35] Add updated snc.py --- the_atlantics/snc.py | 90 ++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 49 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index d62379d9..27aa0c09 100644 --- a/the_atlantics/snc.py +++ 
b/the_atlantics/snc.py @@ -5,20 +5,19 @@ assign_params_default = {'p_inbin_thr': 0.5, - 'p_outbin_thr': 0.2, - 'use_p_inbin': False, - 'use_p_outbin': False} + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} class SnCalc(object): edges_large = 100. def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.28, use_clustering=False): + s_gamma=0.28, use_clustering=False): """ S/N calculator - Args: - z_arr (array_like): array of redshifts at which all the N(z)s are + z_arr (array_like): array of redshifts at which all the N(z)s are sampled. They should be linearly spaced. nz_list (list): list of arrays containing the redshift distributions of all the initial groups (each array in the @@ -44,10 +43,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.z_arr = z_arr self.nz_list = nz_list self.n_gals = np.array([simps(nz, x=self.z_arr) - for nz in self.nz_list]) + for nz in self.nz_list]) self.pz_list = self.nz_list / self.n_gals[:, None] self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) - for nz in self.nz_list]) / self.n_gals + for nz in self.nz_list]) / self.n_gals self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering @@ -57,9 +56,8 @@ def _bz_model(self, cosmo, z): def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. - Args: - fname_save (string): if not None, the result will be saved to a + fname_save (string): if not None, the result will be saved to a file by this name, and if present and `recompute=False`, the matrix will be read from that file. recompute (bool): if `True`, any previous calculations of the @@ -72,20 +70,23 @@ def get_cl_matrix(self, fname_save=None, recompute=False): return # Cosmology - cosmo = ccl.CosmologyVanillaLCDM() + cosmo = ccl.Cosmology(Omega_c=0.25, + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) # Tracers if self.use_clustering: trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] else: trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + for nz in self.nz_list] - # Cls + # Cls self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): @@ -100,9 +101,8 @@ def get_cl_matrix(self, fname_save=None, recompute=False): def get_resample_metadata(self, assign): """ Transform an assignment list into a weight matrix and the corresponding number densities. - Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. @@ -136,9 +136,8 @@ def _get_nl_resamples(self, n_dens): def get_sn_wn(self, weights, n_dens, full_output=False): """ Compute signal-to-noise ratio from weights matrix and number densities of the resampled groups. - Args: - weights (array_like): weights matrix of shape + weights (array_like): weights matrix of shape `(N_resample, N_initial)`, where `N_resample` is the number of new groups, and `N_initial` is the original number of groups. Each entry corresponds to the weight @@ -148,9 +147,8 @@ def get_sn_wn(self, weights, n_dens, full_output=False): new groups. 
full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. - Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ sl = self._get_cl_resamples(weights) @@ -159,10 +157,10 @@ def get_sn_wn(self, weights, n_dens, full_output=False): icl = np.linalg.inv(cl) sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], - axis=2) + axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + self.d_ell / self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -172,40 +170,36 @@ def get_sn_wn(self, weights, n_dens, full_output=False): def get_sn(self, assign, full_output=False): """ Compute signal-to-noise ratio from a bin assignment list. - Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. - Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ w, ndens = self.get_resample_metadata(assign) return self.get_sn_wn(w, ndens, - full_output=full_output) + full_output=full_output) def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. """ return np.any(edges < 0) or \ - np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) def assign_from_edges(self, edges, assign_params=assign_params_default): """ Get assignment list from edges and assign parameters. - Args: - edges (array_like): array of bin edges. + edges (array_like): array of bin edges. assign_params (dict): dictionary of assignment parameters (see `assign_params_default`). - Returns: - List of assignment arrays ready to be used in e.g. `get_sn`. + List of assignment arrays ready to be used in e.g. `get_sn`. 
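+            For example, with group mean redshifts [0.3, 0.5, 0.9] and
+            edges=[0.6], the result is [array([0, 1]), array([2])].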
""" nbins = len(edges) + 1 # Bin IDs based on mean z @@ -217,40 +211,38 @@ def assign_from_edges(self, edges, assign_params=assign_params_default): if assign_params['use_p_outbin']: # Matrix of probabilities in each bin pzs = np.array([[simps(pz[m], x=self.z_arr[m]) - for m in masks] - for pz in self.pz_list]) + for m in masks] + for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] - for j in range(self.n_samples)]) + for j in range(self.n_samples)]) else: # Overlaps in own bin pzd = np.array([simps(pz[masks[i]], - x=self.z_arr[masks[i]]) - for pz, i in zip(self.pz_list, ids)]) + x=self.z_arr[masks[i]]) + for pz, i in zip(self.pz_list, ids)]) - # Throw away based on in-bin probability + # Throw away based on in-bin probability if assign_params['use_p_inbin']: ids[pzd < assign_params['p_inbin_thr']] = -1 # Throw away based on off-bin probability if assign_params['use_p_outbin']: ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 - for p in pzs])] = -1 - # Assignment list + for p in pzs])] = -1 + # Assignment list return [np.where(ids == i)[0] for i in np.unique(ids)] def get_sn_from_edges(self, edges, full_output=False, - assign_params=assign_params_default): + assign_params=assign_params_default): """ Compute signal-to-noise ratio from a set of edges and assignment parameters. - Args: - edges (array_like): array of bin edges. + edges (array_like): array of bin edges. assign_params (dict): dictionary of assignment parameters (see `assign_params_default`). full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. - Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ if self.check_edges(edges): From 0223629455a150435c03ea7c0fd00be609f584ff Mon Sep 17 00:00:00 2001 From: Bela Abolfathi Date: Fri, 28 Aug 2020 15:51:03 -0700 Subject: [PATCH 06/35] Add updated complex classifier --- tomo_challenge/classifiers/ComplexSOM.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 501617e8..c122a928 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -12,6 +12,7 @@ See Classifier Documentation below. 
""" +import os from .base import Tomographer import numpy as np import rpy2 @@ -40,12 +41,12 @@ utils.install_packages(StrVector(names_to_install)) base.Sys_setenv(TAR=base.system("which tar",intern=True)) devtools=ro.packages.importr("devtools") -devtools.install_github("AngusWright/helpRfuncs") -devtools.install_github("AngusWright/kohonen/kohonen") +#devtools.install_github("AngusWright/helpRfuncs") +#devtools.install_github("AngusWright/kohonen/kohonen") kohonen=ro.packages.importr("kohonen") class ComplexSOM(Tomographer): - """ Simplistic SOM Classifier """ + """ Complex SOM Classifier with SNR Optimisation """ # valid parameter -- see below valid_options = ['bins','som_dim','num_groups','num_threads', @@ -258,16 +259,23 @@ def train (self, training_data, training_z): #Get the group assignments for the training data train_group = np.array(train_som.rx2['clust.classif']) tab=base.table(FactorVector(train_som.rx2['clust.classif'],levels=base.seq(num_groups))) + print(tab) + print(np.arange(num_groups)[np.array(tab)==0]) #Construct the Nz z-axis - z_arr = np.linspace(0, 2, 1024) + z_arr = np.linspace(0, 2, 124) z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2. #Construct the per-group Nz nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] + #Update the fsky + n_gal = np.sum(np.array(tab)) # Total number of galaxies you have, change appropriately + n_eff = 20. # Target density in arcmin^-2 + fsky = n_gal / (4*np.pi*(180 * 60 / np.pi)**2 * n_eff) # Now initialize a S/N calculator for these initial groups. + os.system('rm wl_nb%d.npz' % n_bin) if metric == 'SNR_ww': - c_wl = SnCalc(z_cen, nzs, use_clustering=False) + c_wl = SnCalc(z_cen, nzs, use_clustering=False,fsky=fsky) else: - c_wl = SnCalc(z_cen, nzs, use_clustering=True) + c_wl = SnCalc(z_cen, nzs, use_clustering=True,fsky=fsky) print("Initializing WL") c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin) # Finally, let's write the function to minimize and optimize for a 4-bin case. @@ -278,8 +286,9 @@ def minus_sn(edges, calc): edges_0 = np.linspace(0, 2, n_bin-1) res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) print("WL final edges: ", res.x) - print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)*np.sqrt(0.25/fsky)) print(" ") + print(res) From a6b069d3dd97f2651911d9920dc0211b2fa0cd54 Mon Sep 17 00:00:00 2001 From: Andrina Nicola Date: Sun, 30 Aug 2020 11:06:21 -0700 Subject: [PATCH 07/35] Improved interface between snc and apply. --- the_atlantics/snc.py | 7 +- tomo_challenge/classifiers/ComplexSOM.py | 30 ++- tomo_challenge/classifiers/SimpleSOM.py | 4 +- tomo_challenge/snc.py | 254 +++++++++++++++++++++++ 4 files changed, 273 insertions(+), 22 deletions(-) create mode 100644 tomo_challenge/snc.py diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 27aa0c09..0574acee 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -192,7 +192,7 @@ def check_edges(self, edges): np.any(edges > self.edges_large) or \ np.any(np.diff(edges) < 0) - def assign_from_edges(self, edges, assign_params=assign_params_default): + def assign_from_edges(self, edges, assign_params=assign_params_default, return_bins=False): """ Get assignment list from edges and assign parameters. Args: edges (array_like): array of bin edges. 
@@ -229,7 +229,10 @@ def assign_from_edges(self, edges, assign_params=assign_params_default): ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 for p in pzs])] = -1 # Assignment list - return [np.where(ids == i)[0] for i in np.unique(ids)] + if return_bins: + return [np.where(ids == i)[0] for i in np.unique(ids)], np.unique(ids) + else: + return [np.where(ids == i)[0] for i in np.unique(ids)] def get_sn_from_edges(self, edges, full_output=False, assign_params=assign_params_default): diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index c122a928..37211d73 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -266,6 +266,9 @@ def train (self, training_data, training_z): z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2. #Construct the per-group Nz nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] + + # np.save('nzs.npy', nzs) + #Update the fsky n_gal = np.sum(np.array(tab)) # Total number of galaxies you have, change appropriately n_eff = 20. # Target density in arcmin^-2 @@ -290,15 +293,20 @@ def minus_sn(edges, calc): print(" ") print(res) - + groups_in_tomo, tomo_bins = c_wl.assign_from_edges(res.x, return_bins=True) + group_bins = np.zeros(num_groups) + for i, bin_no in enumerate(tomo_bins): + print(i) + print(bin_no) + group_bins[groups_in_tomo[i]] = bin_no + print(group_bins) #Construct the per-group Nz print("Outputting trained SOM and redshift ordering of SOM groups") print("Finished") self.train_som = train_som - self.group_z = group_z - self.z_order = z_order + self.group_bins = group_bins def apply (self, data): """Applies training to the data. @@ -333,26 +341,12 @@ def apply (self, data): expression="nrow(data)",expr_label="N", n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) - #Calculate the cumulative count - print("Generate cumulative source counts a.f.o. group mean z") - zcumsum=base.cumsum(group_prop.rx2('property').rx(self.z_order)) - - # Find the edges that split the redshifts into n_z bins of - # equal number counts in each - print("Assign the groups to tomographic bins") - p = np.linspace(0, 100, n_bin + 1) - n_edges = np.percentile(zcumsum, p) - - # Now put the groups into redshift bins. 
- group_bins = FloatVector(base.cut(FloatVector(zcumsum),FloatVector(n_edges), - include=True)).rx(base.order(self.z_order)) - #extract the validation som (just for convenience) valid_som = group_prop.rx2('som') #Assign the sources, by group, to tomographic bins print("Output source tomographic bin assignments") - valid_bin = base.unsplit(group_bins,FactorVector(valid_som.rx2['clust.classif'], + valid_bin = base.unsplit(self.group_bins,FactorVector(valid_som.rx2['clust.classif'], levels=base.seq(self.opt['num_groups'])),drop=False) #valid_bin = valid_som.rx2['clust.classif'] valid_bin = np.array(valid_bin)-1 diff --git a/tomo_challenge/classifiers/SimpleSOM.py b/tomo_challenge/classifiers/SimpleSOM.py index f3b0b74c..43d7ed98 100644 --- a/tomo_challenge/classifiers/SimpleSOM.py +++ b/tomo_challenge/classifiers/SimpleSOM.py @@ -37,8 +37,8 @@ utils.install_packages(StrVector(names_to_install)) base.Sys_setenv(TAR=base.system("which tar",intern=True)) devtools=ro.packages.importr("devtools") -devtools.install_github("AngusWright/helpRfuncs") -devtools.install_github("AngusWright/kohonen/kohonen") +#devtools.install_github("AngusWright/helpRfuncs") +#devtools.install_github("AngusWright/kohonen/kohonen") kohonen=ro.packages.importr("kohonen") class SimpleSOM(Tomographer): diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py new file mode 100644 index 00000000..0574acee --- /dev/null +++ b/tomo_challenge/snc.py @@ -0,0 +1,254 @@ +import numpy as np +import pyccl as ccl +from scipy.integrate import simps +import os + + +assign_params_default = {'p_inbin_thr': 0.5, + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} + + +class SnCalc(object): + edges_large = 100. + + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, + s_gamma=0.28, use_clustering=False): + """ S/N calculator + Args: + z_arr (array_like): array of redshifts at which all the N(z)s are + sampled. They should be linearly spaced. + nz_list (list): list of arrays containing the redshift + distributions of all the initial groups (each array in the + list corresponds to one group). The arrays should contain the + redshift distribution sampled at `z_arr`, with its integral + over `z_arr` equal to the number of galaxies in that group. + fsky (float): sky fraction (for SNR calculation and for + transforming number of galaxies into number densities). + lmax (float): maximum ell. + d_ell (float): ell bandpower width. + s_gamma (float): rms ellipticity scatter (relevant if + `use_clustering=False`). + use_clustering (bool): if `True`, SNR will be computed for + clustering instead of lensing. + """ + self.s_gamma = s_gamma + self.fsky = fsky + self.lmax = lmax + self.d_ell = d_ell + self.larr = np.arange(2, lmax, d_ell)+d_ell/2 + self.n_ell = len(self.larr) + self.n_samples = len(nz_list) + self.z_arr = z_arr + self.nz_list = nz_list + self.n_gals = np.array([simps(nz, x=self.z_arr) + for nz in self.nz_list]) + self.pz_list = self.nz_list / self.n_gals[:, None] + self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) + for nz in self.nz_list]) / self.n_gals + self.n_dens = self.n_gals / (4*np.pi*self.fsky) + self.cls = None + self.use_clustering = use_clustering + + def _bz_model(self, cosmo, z): + return 0.95/ccl.growth_factor(cosmo, 1./(1+z)) + + def get_cl_matrix(self, fname_save=None, recompute=False): + """ Computes matrix of power spectra between all the initial groups. 
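+            The resulting array of shape (n_ell, n_samples, n_samples) is
+            cached in `self.cls` and can also be stored on disk (see
+            `fname_save` below).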
+ Args: + fname_save (string): if not None, the result will be saved to a + file by this name, and if present and `recompute=False`, the + matrix will be read from that file. + recompute (bool): if `True`, any previous calculations of the + C_ell matrix stored on file will be ignored, and the matrix + will be recomputed. + """ + if fname_save is not None: + if os.path.isfile(fname_save) and not recompute: + self.cls = np.load(fname_save)['cls'] + return + + # Cosmology + cosmo = ccl.Cosmology(Omega_c=0.25, + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) + + # Tracers + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + + # Cls + self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) + for i in range(self.n_samples): + for j in range(i, self.n_samples): + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr) + self.cls[:, i, j] = cl + if j != i: + self.cls[:, j, i] = cl + + if fname_save is not None: + np.savez(fname_save, cls=self.cls) + + def get_resample_metadata(self, assign): + """ Transform an assignment list into a weight matrix and the + corresponding number densities. + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + """ + n_resamples = len(assign) + weight_res = np.zeros([n_resamples, self.n_samples]) + n_dens_res = np.zeros(n_resamples) + for ia, a in enumerate(assign): + ndens = self.n_dens[a] + ndens_tot = np.sum(ndens) + n_dens_res[ia] = ndens_tot + weight_res[ia, a] = ndens / ndens_tot + return weight_res, n_dens_res + + def _get_cl_resamples(self, weights): + """ Gets C_ell matrix for resampled groups from weights matrix. + """ + if self.cls is None: + self.get_cl_matrix() + return np.einsum('jl,km,ilm', weights, weights, self.cls) + + def _get_nl_resamples(self, n_dens): + """ Gets noise contribution to C_ell matrix for resampled groups + from list of number densities. + """ + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_wn(self, weights, n_dens, full_output=False): + """ Compute signal-to-noise ratio from weights matrix and number + densities of the resampled groups. + Args: + weights (array_like): weights matrix of shape + `(N_resample, N_initial)`, where `N_resample` is the + number of new groups, and `N_initial` is the original + number of groups. Each entry corresponds to the weight + which with a given initial group enters the new set of + groups. + n_dens (array_like): number density (in sterad^-1) of the + new groups. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. 
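+            Note: as implemented below, the total S/N is
+            SNR^2 = sum_ell (2*ell+1) * d_ell * Tr[(S_ell C_ell^-1)^2] / fsky,
+            where S_ell is the signal covariance, N_ell the noise, and
+            C_ell = S_ell + N_ell.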
+ """ + sl = self._get_cl_resamples(weights) + nl = self._get_nl_resamples(n_dens) + cl = sl + nl + icl = np.linalg.inv(cl) + sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) + sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], + axis=2) + trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) + snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * + self.d_ell / self.fsky)) + + if full_output: + dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} + return dret + else: + return snr + + def get_sn(self, assign, full_output=False): + """ Compute signal-to-noise ratio from a bin assignment list. + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + w, ndens = self.get_resample_metadata(assign) + return self.get_sn_wn(w, ndens, + full_output=full_output) + + def check_edges(self, edges): + """ Returns `True` if there's something wrong with the edges. + """ + return np.any(edges < 0) or \ + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) + + def assign_from_edges(self, edges, assign_params=assign_params_default, return_bins=False): + """ Get assignment list from edges and assign parameters. + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + Returns: + List of assignment arrays ready to be used in e.g. `get_sn`. + """ + nbins = len(edges) + 1 + # Bin IDs based on mean z + ids = np.digitize(self.z_means, bins=edges) + if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + # Calculate probabilities in each bin + zgroups = np.digitize(self.z_arr, bins=edges) + masks = [zgroups == i for i in range(nbins)] + if assign_params['use_p_outbin']: + # Matrix of probabilities in each bin + pzs = np.array([[simps(pz[m], x=self.z_arr[m]) + for m in masks] + for pz in self.pz_list]) + pzd = np.array([pzs[j, ids[j]] + for j in range(self.n_samples)]) + else: + # Overlaps in own bin + pzd = np.array([simps(pz[masks[i]], + x=self.z_arr[masks[i]]) + for pz, i in zip(self.pz_list, ids)]) + + # Throw away based on in-bin probability + if assign_params['use_p_inbin']: + ids[pzd < assign_params['p_inbin_thr']] = -1 + # Throw away based on off-bin probability + if assign_params['use_p_outbin']: + ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 + for p in pzs])] = -1 + # Assignment list + if return_bins: + return [np.where(ids == i)[0] for i in np.unique(ids)], np.unique(ids) + else: + return [np.where(ids == i)[0] for i in np.unique(ids)] + + def get_sn_from_edges(self, edges, full_output=False, + assign_params=assign_params_default): + """ Compute signal-to-noise ratio from a set of edges and assignment + parameters. + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. 
+ """ + if self.check_edges(edges): + return 0 + assign = self.assign_from_edges(edges, assign_params=assign_params) + return self.get_sn(assign, full_output=full_output) From 7ed22282b060e2688d14d90dabb737e380e27bf9 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Sun, 30 Aug 2020 19:17:53 +0100 Subject: [PATCH 08/35] optimizer with all options --- the_atlantics/optimizer.py | 111 +++++++++++++++++++++++ the_atlantics/snc.py | 49 ++++++---- tomo_challenge/classifiers/ComplexSOM.py | 7 +- tomo_challenge/snc.py | 49 ++++++---- 4 files changed, 176 insertions(+), 40 deletions(-) create mode 100644 the_atlantics/optimizer.py diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py new file mode 100644 index 00000000..9369a28e --- /dev/null +++ b/the_atlantics/optimizer.py @@ -0,0 +1,111 @@ +from argparse import ArgumentParser +import numpy as np +from snc import SnCalc, assign_params_default +from scipy.interpolate import interp1d +from scipy.optimize import minimize, brentq +from scipy.integrate import simps +import matplotlib.pyplot as plt + + +parser = ArgumentParser() +parser.add_argument("--nz-file", default="nzs.npy", type=str, + help="Path to input file containing the N(z) of all initial groups") +parser.add_argument("--prefix-out", default='out', type=str, + help="Prefix for all output files") +parser.add_argument("--n-bins", default=4, type=int, + help="Number of bins (excluding a possible trash bin") +parser.add_argument("--prob-in-bin", default=-1., type=float, + help="Minimum fraction of the N(z) that should be inside the bin." + "If <= 0, this will not be taken into account in the optimization.") +parser.add_argument("--prob-out-bin", default=-1., type=float, + help="Maximum fraction of the N(z) that should be inside other bins." + "If <= 0, this will not be taken into account in the optimization.") +o = parser.parse_args() + +nzs = np.load(o.nz_file) +# We should probably read z on input too +z_arr = np.linspace(0, 2, 124) +zz = (z_arr[1:]+z_arr[:-1])*0.5 +# Renormalize to use the same sample definitions as the +# official metric calculators. +fsky = 0.25 +ndens_arcmin = 25. 
+area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky +ng_tot = ndens_arcmin * area_arcmin +ng_each_small = simps(nzs, x=zz) +ng_tot_small = np.sum(ng_each_small) +ng_each = ng_each_small * ng_tot / ng_tot_small +nzs = nzs * ng_tot / ng_tot_small + +# Initialize calculator +sc = SnCalc(zz, nzs, fsky=fsky) +sc.get_cl_matrix(fname_save=o.prefix_out + 'cl_wl.npz') + +# Initial edge guess (defined as having equal number of galaxies) +nz_tot = np.sum(nzs, axis=0) +cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) +cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + fill_value=(0, 1)) +edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) + for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) + + +# Minimize +# Binning parameters +params = assign_params_default.copy() +if o.prob_in_bin > 0: + params['p_inbin_thr'] = o.prob_in_bin + params['use_p_inbin'] = True +if o.prob_out_bin > 0: + params['p_outbin_thr'] = o.prob_out_bin + params['use_p_outbin'] = True + + +def minus_sn(edges, calc): + return -calc.get_sn_from_edges(edges, assign_params=params) + + +# Actual optimization +res = minimize(minus_sn, edges_0, method='Powell', args=(sc,)) +edges_1 = res.x + +# Post-process: +# S/N +sn = sc.get_sn_from_edges(edges_1, assign_params=params) +# N(z)s of the final bins +nz_best = sc.get_nzs_from_edges(edges_1, assign_params=params) +# Group assignment +assign = sc.assign_from_edges(edges_1, assign_params=params, + get_ids=True) + + +def get_bin_name(i): + if i == -1: + return 'bin_trash' + else: + return 'bin_%d' % i + +print("Redshift edges: ", edges_1) +print("Optimal S/N: ", sn) +n_tot = simps(np.sum(nz_best, axis=0), + x=zz) + +d_out = {'sn': sn, + 'edges': edges_1} +plt.figure() +for (i, a), n in zip(assign, nz_best): + name = get_bin_name(i) + d_out[name+'_groups'] = a + d_out[name+'_nz'] = n + print("Bin %d, groups: " % i, a) + n_here = simps(n, x=zz) + f = n_here / n_tot + frac = "%.1lf" % (100 * f) + plt.plot(zz, n/n_here, + label=f'Bin {i} ({frac} %)') +plt.legend() +plt.xlabel(r'$z$', fontsize=15) +plt.ylabel(r'$p(z)$', fontsize=15) +plt.show() + +np.savez(o.prefix_out + '_bin_info.npz', **d_out) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 0574acee..f009115f 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -17,7 +17,7 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, s_gamma=0.28, use_clustering=False): """ S/N calculator Args: - z_arr (array_like): array of redshifts at which all the N(z)s are + z_arr (array_like): array of redshifts at which all the N(z)s are sampled. They should be linearly spaced. nz_list (list): list of arrays containing the redshift distributions of all the initial groups (each array in the @@ -43,10 +43,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.z_arr = z_arr self.nz_list = nz_list self.n_gals = np.array([simps(nz, x=self.z_arr) - for nz in self.nz_list]) + for nz in self.nz_list]) self.pz_list = self.nz_list / self.n_gals[:, None] self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) - for nz in self.nz_list]) / self.n_gals + for nz in self.nz_list]) / self.n_gals self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering @@ -102,7 +102,7 @@ def get_resample_metadata(self, assign): """ Transform an assignment list into a weight matrix and the corresponding number densities. Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. 
Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. @@ -192,14 +192,24 @@ def check_edges(self, edges): np.any(edges > self.edges_large) or \ np.any(np.diff(edges) < 0) - def assign_from_edges(self, edges, assign_params=assign_params_default, return_bins=False): + def get_nzs_from_edges(self, edges, assign_params=assign_params_default): + assign = self.assign_from_edges(edges, assign_params=assign_params) + nzs = np.array([np.sum(self.nz_list[a, :], axis=0) + for a in assign]) + return nzs + + def assign_from_edges(self, edges, assign_params=assign_params_default, + get_ids=False): """ Get assignment list from edges and assign parameters. Args: - edges (array_like): array of bin edges. + edges (array_like): array of bin edges. assign_params (dict): dictionary of assignment parameters (see `assign_params_default`). + get_ids (bool): if `True`, output assignment arrays will + be accompanied by the associated bin id (with -1 for the + trash bin). Returns: - List of assignment arrays ready to be used in e.g. `get_sn`. + List of assignment arrays ready to be used in e.g. `get_sn`. """ nbins = len(edges) + 1 # Bin IDs based on mean z @@ -211,17 +221,20 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, return_b if assign_params['use_p_outbin']: # Matrix of probabilities in each bin pzs = np.array([[simps(pz[m], x=self.z_arr[m]) - for m in masks] - for pz in self.pz_list]) + for m in masks] + for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] - for j in range(self.n_samples)]) + for j in range(self.n_samples)]) else: # Overlaps in own bin - pzd = np.array([simps(pz[masks[i]], - x=self.z_arr[masks[i]]) - for pz, i in zip(self.pz_list, ids)]) - - # Throw away based on in-bin probability + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) + pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) + for pz, i in zip(self.pz_list, ids)]) + # Throw away based on in-bin probability if assign_params['use_p_inbin']: ids[pzd < assign_params['p_inbin_thr']] = -1 # Throw away based on off-bin probability @@ -229,13 +242,13 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, return_b ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 for p in pzs])] = -1 # Assignment list - if return_bins: - return [np.where(ids == i)[0] for i in np.unique(ids)], np.unique(ids) + if get_ids: + return [(i, np.where(ids == i)[0]) for i in np.unique(ids)] else: return [np.where(ids == i)[0] for i in np.unique(ids)] def get_sn_from_edges(self, edges, full_output=False, - assign_params=assign_params_default): + assign_params=assign_params_default): """ Compute signal-to-noise ratio from a set of edges and assignment parameters. 
Args: diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 37211d73..5c0fbb42 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -293,12 +293,11 @@ def minus_sn(edges, calc): print(" ") print(res) - groups_in_tomo, tomo_bins = c_wl.assign_from_edges(res.x, return_bins=True) + groups_in_tomo = c_wl.assign_from_edges(res.x, get_ids=True) group_bins = np.zeros(num_groups) - for i, bin_no in enumerate(tomo_bins): - print(i) + for bin_no, groups in enumerate(groups_in_tomo): print(bin_no) - group_bins[groups_in_tomo[i]] = bin_no + group_bins[groups] = bin_no print(group_bins) #Construct the per-group Nz diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 0574acee..f009115f 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -17,7 +17,7 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, s_gamma=0.28, use_clustering=False): """ S/N calculator Args: - z_arr (array_like): array of redshifts at which all the N(z)s are + z_arr (array_like): array of redshifts at which all the N(z)s are sampled. They should be linearly spaced. nz_list (list): list of arrays containing the redshift distributions of all the initial groups (each array in the @@ -43,10 +43,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.z_arr = z_arr self.nz_list = nz_list self.n_gals = np.array([simps(nz, x=self.z_arr) - for nz in self.nz_list]) + for nz in self.nz_list]) self.pz_list = self.nz_list / self.n_gals[:, None] self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) - for nz in self.nz_list]) / self.n_gals + for nz in self.nz_list]) / self.n_gals self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering @@ -102,7 +102,7 @@ def get_resample_metadata(self, assign): """ Transform an assignment list into a weight matrix and the corresponding number densities. Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. @@ -192,14 +192,24 @@ def check_edges(self, edges): np.any(edges > self.edges_large) or \ np.any(np.diff(edges) < 0) - def assign_from_edges(self, edges, assign_params=assign_params_default, return_bins=False): + def get_nzs_from_edges(self, edges, assign_params=assign_params_default): + assign = self.assign_from_edges(edges, assign_params=assign_params) + nzs = np.array([np.sum(self.nz_list[a, :], axis=0) + for a in assign]) + return nzs + + def assign_from_edges(self, edges, assign_params=assign_params_default, + get_ids=False): """ Get assignment list from edges and assign parameters. Args: - edges (array_like): array of bin edges. + edges (array_like): array of bin edges. assign_params (dict): dictionary of assignment parameters (see `assign_params_default`). + get_ids (bool): if `True`, output assignment arrays will + be accompanied by the associated bin id (with -1 for the + trash bin). Returns: - List of assignment arrays ready to be used in e.g. `get_sn`. + List of assignment arrays ready to be used in e.g. `get_sn`. 
""" nbins = len(edges) + 1 # Bin IDs based on mean z @@ -211,17 +221,20 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, return_b if assign_params['use_p_outbin']: # Matrix of probabilities in each bin pzs = np.array([[simps(pz[m], x=self.z_arr[m]) - for m in masks] - for pz in self.pz_list]) + for m in masks] + for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] - for j in range(self.n_samples)]) + for j in range(self.n_samples)]) else: # Overlaps in own bin - pzd = np.array([simps(pz[masks[i]], - x=self.z_arr[masks[i]]) - for pz, i in zip(self.pz_list, ids)]) - - # Throw away based on in-bin probability + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) + pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) + for pz, i in zip(self.pz_list, ids)]) + # Throw away based on in-bin probability if assign_params['use_p_inbin']: ids[pzd < assign_params['p_inbin_thr']] = -1 # Throw away based on off-bin probability @@ -229,13 +242,13 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, return_b ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 for p in pzs])] = -1 # Assignment list - if return_bins: - return [np.where(ids == i)[0] for i in np.unique(ids)], np.unique(ids) + if get_ids: + return [(i, np.where(ids == i)[0]) for i in np.unique(ids)] else: return [np.where(ids == i)[0] for i in np.unique(ids)] def get_sn_from_edges(self, edges, full_output=False, - assign_params=assign_params_default): + assign_params=assign_params_default): """ Compute signal-to-noise ratio from a set of edges and assignment parameters. Args: From 037a2b75419e95f897526a9b819366b3c0111982 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Sun, 30 Aug 2020 19:20:58 +0100 Subject: [PATCH 09/35] 20 --- the_atlantics/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py index 9369a28e..7fce42c4 100644 --- a/the_atlantics/optimizer.py +++ b/the_atlantics/optimizer.py @@ -29,7 +29,7 @@ # Renormalize to use the same sample definitions as the # official metric calculators. fsky = 0.25 -ndens_arcmin = 25. +ndens_arcmin = 20. area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky ng_tot = ndens_arcmin * area_arcmin ng_each_small = simps(nzs, x=zz) From b159dca35172ec9f8f8b34297f3c0553eae127cf Mon Sep 17 00:00:00 2001 From: David Alonso Date: Sun, 30 Aug 2020 19:23:26 +0100 Subject: [PATCH 10/35] 0.28 --- the_atlantics/snc.py | 2 +- tomo_challenge/snc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index f009115f..1aca43c0 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -14,7 +14,7 @@ class SnCalc(object): edges_large = 100. def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.28, use_clustering=False): + s_gamma=0.26, use_clustering=False): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index f009115f..1aca43c0 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -14,7 +14,7 @@ class SnCalc(object): edges_large = 100. 
def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.28, use_clustering=False): + s_gamma=0.26, use_clustering=False): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are From f313e58c86c204245bbf04766c20fb77627b5b9d Mon Sep 17 00:00:00 2001 From: Andrina Nicola Date: Sun, 30 Aug 2020 12:21:45 -0700 Subject: [PATCH 11/35] Adapted fsky. --- tomo_challenge/classifiers/ComplexSOM.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 5c0fbb42..f0779992 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -269,10 +269,16 @@ def train (self, training_data, training_z): # np.save('nzs.npy', nzs) - #Update the fsky - n_gal = np.sum(np.array(tab)) # Total number of galaxies you have, change appropriately - n_eff = 20. # Target density in arcmin^-2 - fsky = n_gal / (4*np.pi*(180 * 60 / np.pi)**2 * n_eff) + #Update the fsky + # Renormalize to use the same sample definitions as the + # official metric calculators. + fsky = 0.25 + ndens_arcmin = 20. + area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky + ng_tot_goal = ndens_arcmin * area_arcmin + ng_tot_curr = np.sum(np.array(tab)) + nzs = [nz * ng_tot_goal / ng_tot_curr for nz in nzs] + # Now initialize a S/N calculator for these initial groups. os.system('rm wl_nb%d.npz' % n_bin) if metric == 'SNR_ww': @@ -295,10 +301,9 @@ def minus_sn(edges, calc): groups_in_tomo = c_wl.assign_from_edges(res.x, get_ids=True) group_bins = np.zeros(num_groups) - for bin_no, groups in enumerate(groups_in_tomo): - print(bin_no) + for bin_no, groups in groups_in_tomo: + print(bin_no, groups) group_bins[groups] = bin_no - print(group_bins) #Construct the per-group Nz print("Outputting trained SOM and redshift ordering of SOM groups") From 2fa9e3f9a06b9ec86cc28e2fb4c0b8655e8fce1f Mon Sep 17 00:00:00 2001 From: David Alonso Date: Mon, 31 Aug 2020 11:12:01 +0100 Subject: [PATCH 12/35] Max edge at z=3 --- the_atlantics/snc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 1aca43c0..9d262eb9 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -11,7 +11,7 @@ class SnCalc(object): - edges_large = 100. + edges_large = 3. def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, s_gamma=0.26, use_clustering=False): From 6cd2cea50cc163b05ae7c10a6e698f1256ab7ef1 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Mon, 31 Aug 2020 11:12:49 +0100 Subject: [PATCH 13/35] max edge at z=3 --- tomo_challenge/snc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 1aca43c0..9d262eb9 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -11,7 +11,7 @@ class SnCalc(object): - edges_large = 100. + edges_large = 3. 
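+    # Any edge above this value makes check_edges() flag the edge set, and
+    # get_sn_from_edges() then returns S/N = 0, confining the optimizer
+    # to z <= 3.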
def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, s_gamma=0.26, use_clustering=False): From 69cd49bb5019776555aaba3e0fcf9b8efc0985f9 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Mon, 31 Aug 2020 13:34:05 +0200 Subject: [PATCH 14/35] added .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..bc56b5a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +plots/* + From b9decf24a4040b55828516efcc3b7a0fd754d26d Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Mon, 31 Aug 2020 13:35:47 +0200 Subject: [PATCH 15/35] updated ComplexSOM code & yaml, improved grouping --- example/ComplexSOM.yaml | 6 +-- tomo_challenge/classifiers/ComplexSOM.py | 50 ++++++++++++++++-------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 63a97e3e..e19130f4 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -2,7 +2,7 @@ metrics: [SNR_3x2] bands: riz training_file: data/training.hdf5 validation_file: data/validation_small.hdf5 -output_file: example/example_output.txt +output_file: example/ComplexSOM_output.txt # Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" metrics_impl: firecrown @@ -12,12 +12,12 @@ run: run_3: # These settings are sent to the classifier bins: 5 - som_dim: [21,21] + som_dim: [51,51] num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.05 + sparse_frac: 0.15 plots: True plot_dir: 'plots' metric: "SNR_3x2" diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index f0779992..ea37a806 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -120,9 +120,9 @@ def train (self, training_data, training_z): group_type = 'redshift' #Define the redshift summary statistics (used for making groups in the 'redshift' case - property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","N","iqr_z_true") + property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true") property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", - "mad(data$redshift_true)","nrow(data)", + "mad(data$redshift_true)", "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))") #Define the SOM variables if self.bands == 'riz': @@ -203,7 +203,10 @@ def train (self, training_data, training_z): expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) print("Constructing redshift-based hierarchical cluster tree") #Cluster the SOM cells into num_groups groups - hclust=stats.hclust(stats.dist(cell_prop.rx2['property'])) + props = cell_prop.rx2['property'] + props.rx[base.which(base.is_na(props))] = -1 + print(base.summary(props)) + hclust=stats.hclust(stats.dist(props)) cell_group=stats.cutree(hclust,k=num_groups) #Assign the cell groups to the SOM structure @@ -222,6 +225,7 @@ def train (self, training_data, training_z): if plots == True: #Make the diagnostic plots props = group_prop.rx2('property') + cprops = cell_prop.rx2('property') print(base.colnames(props)) print("Constructing SOM diagnostic figures") #Initialise the plot device @@ -240,6 +244,15 @@ def train (self, training_data, training_z): #Paint by the redshift diagnostics: #mean redshift + gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, + 
type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell mean redshift',zlim=FloatVector([0,1.8])) + #redshift std + gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell redshift stdev',zlim=FloatVector([0,0.2])) + #2sigma redshift IQR + gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell 2sigma IQR',zlim=FloatVector([0,0.4])) + #mean redshift gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group mean redshift',zlim=FloatVector([0,1.8])) #redshift std @@ -247,10 +260,10 @@ def train (self, training_data, training_z): type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group redshift stdev',zlim=FloatVector([0,0.2])) #2sigma redshift IQR gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, - type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='group 2sigma IQR',zlim=FloatVector([0,0.4])) - #N group - gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, - type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group 2sigma IQR',zlim=FloatVector([0,0.4])) + ##N group + #gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, + # type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') #Close the plot device dev.dev_off() @@ -267,7 +280,7 @@ def train (self, training_data, training_z): #Construct the per-group Nz nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] - # np.save('nzs.npy', nzs) + #np.save('plots/nzs.npy', nzs) #Update the fsky # Renormalize to use the same sample definitions as the @@ -278,6 +291,10 @@ def train (self, training_data, training_z): ng_tot_goal = ndens_arcmin * area_arcmin ng_tot_curr = np.sum(np.array(tab)) nzs = [nz * ng_tot_goal / ng_tot_curr for nz in nzs] + + #np.save('plots/nzs_norm.npy', nzs) + + #print(nzs) # Now initialize a S/N calculator for these initial groups. 
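         # Clear any cached C_ell matrix first; get_cl_matrix() would
         # otherwise re-load a stale file from a previous run with a
         # different set of groups.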
os.system('rm wl_nb%d.npz' % n_bin) @@ -299,15 +316,14 @@ def minus_sn(edges, calc): print(" ") print(res) + #Construct the per-group Nz + print("Outputting trained SOM and redshift ordering of SOM groups") groups_in_tomo = c_wl.assign_from_edges(res.x, get_ids=True) group_bins = np.zeros(num_groups) for bin_no, groups in groups_in_tomo: print(bin_no, groups) group_bins[groups] = bin_no - #Construct the per-group Nz - print("Outputting trained SOM and redshift ordering of SOM groups") - print("Finished") self.train_som = train_som self.group_bins = group_bins @@ -329,6 +345,8 @@ def apply (self, data): #Number of tomographic bins n_bin = self.opt['bins'] + #Number of discrete SOM groups + num_groups = self.opt['num_groups'] print("Preparing the data") data = pd.DataFrame.from_dict(data) @@ -336,24 +354,22 @@ def apply (self, data): #Construct the validation data frame (just a python-to-R data conversion) print("Converting the data to R format") with localconverter(ro.default_converter + pandas2ri.converter): - #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) data_df = ro.conversion.py2rpy(data) print("Parsing the validation data into the SOM groupings") #Generate the validation associations/groups group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, expression="nrow(data)",expr_label="N", - n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) + n_cluster_bins=num_groups,n_cores=self.opt['num_threads']) #extract the validation som (just for convenience) valid_som = group_prop.rx2('som') #Assign the sources, by group, to tomographic bins print("Output source tomographic bin assignments") - valid_bin = base.unsplit(self.group_bins,FactorVector(valid_som.rx2['clust.classif'], - levels=base.seq(self.opt['num_groups'])),drop=False) - #valid_bin = valid_som.rx2['clust.classif'] - valid_bin = np.array(valid_bin)-1 + valid_bin = base.unsplit(IntVector(self.group_bins),FactorVector(valid_som.rx2['clust.classif'], + levels=base.seq(num_groups)),drop=False) + valid_bin = np.array(valid_bin) return valid_bin From 10eef4688d4a7eecb79fa6a75f0f916c2148b980 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Mon, 31 Aug 2020 21:53:38 +0100 Subject: [PATCH 16/35] spline integrator --- the_atlantics/optimizer.py | 10 ++++++++-- the_atlantics/snc.py | 8 ++++++-- tomo_challenge/snc.py | 8 ++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py index 7fce42c4..d64bedc4 100644 --- a/the_atlantics/optimizer.py +++ b/the_atlantics/optimizer.py @@ -101,11 +101,17 @@ def get_bin_name(i): n_here = simps(n, x=zz) f = n_here / n_tot frac = "%.1lf" % (100 * f) + if i == -1: + bin_name = 'Trash bin' + else: + bin_name = f'Bin {i}' plt.plot(zz, n/n_here, - label=f'Bin {i} ({frac} %)') + label=bin_name + f' ({frac} %)') plt.legend() plt.xlabel(r'$z$', fontsize=15) plt.ylabel(r'$p(z)$', fontsize=15) -plt.show() +plt.savefig(o.prefix_out + '_nz_summary.png', + bbox_inches='tight') np.savez(o.prefix_out + '_bin_info.npz', **d_out) +plt.show() diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 1aca43c0..0710ab3f 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -14,7 +14,7 @@ class SnCalc(object): edges_large = 100. 
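     # Note: CCL's 'spline' Limber integration is typically much faster than
     # the default 'qag_quad' adaptive quadrature, at a modest accuracy cost;
     # presumably the motivation for making it the default here.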
def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.26, use_clustering=False): + s_gamma=0.26, use_clustering=False, integrator='spline'): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are @@ -32,7 +32,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, `use_clustering=False`). use_clustering (bool): if `True`, SNR will be computed for clustering instead of lensing. + integrator (string): CCL integration method. Either 'qag_quad' + or 'spline'. """ + self.integrator = integrator self.s_gamma = s_gamma self.fsky = fsky self.lmax = lmax @@ -90,7 +93,8 @@ def get_cl_matrix(self, fname_save=None, recompute=False): self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): - cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr) + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) self.cls[:, i, j] = cl if j != i: self.cls[:, j, i] = cl diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 1aca43c0..0710ab3f 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -14,7 +14,7 @@ class SnCalc(object): edges_large = 100. def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.26, use_clustering=False): + s_gamma=0.26, use_clustering=False, integrator='spline'): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are @@ -32,7 +32,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, `use_clustering=False`). use_clustering (bool): if `True`, SNR will be computed for clustering instead of lensing. + integrator (string): CCL integration method. Either 'qag_quad' + or 'spline'. """ + self.integrator = integrator self.s_gamma = s_gamma self.fsky = fsky self.lmax = lmax @@ -90,7 +93,8 @@ def get_cl_matrix(self, fname_save=None, recompute=False): self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): - cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr) + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) self.cls[:, i, j] = cl if j != i: self.cls[:, j, i] = cl From a6045b788af1a833a9e678936b8297b92ac121e9 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Mon, 31 Aug 2020 22:12:29 +0100 Subject: [PATCH 17/35] flake8 --- the_atlantics/snc.py | 40 ++++++++++++++++++++-------------------- tomo_challenge/snc.py | 40 ++++++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 8ef2d31d..dc05921d 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -5,9 +5,9 @@ assign_params_default = {'p_inbin_thr': 0.5, - 'p_outbin_thr': 0.2, - 'use_p_inbin': False, - 'use_p_outbin': False} + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} class SnCalc(object): @@ -60,7 +60,7 @@ def _bz_model(self, cosmo, z): def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. Args: - fname_save (string): if not None, the result will be saved to a + fname_save (string): if not None, the result will be saved to a file by this name, and if present and `recompute=False`, the matrix will be read from that file. 
recompute (bool): if `True`, any previous calculations of the @@ -74,22 +74,22 @@ def get_cl_matrix(self, fname_save=None, recompute=False): # Cosmology cosmo = ccl.Cosmology(Omega_c=0.25, - Omega_b=0.05, - h=0.67, n_s=0.96, - sigma8=0.81) + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) # Tracers if self.use_clustering: trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] else: trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + for nz in self.nz_list] - # Cls + # Cls self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): @@ -161,10 +161,10 @@ def get_sn_wn(self, weights, n_dens, full_output=False): icl = np.linalg.inv(cl) sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], - axis=2) + axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + self.d_ell / self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -175,26 +175,26 @@ def get_sn_wn(self, weights, n_dens, full_output=False): def get_sn(self, assign, full_output=False): """ Compute signal-to-noise ratio from a bin assignment list. Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ w, ndens = self.get_resample_metadata(assign) return self.get_sn_wn(w, ndens, - full_output=full_output) + full_output=full_output) def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. """ return np.any(edges < 0) or \ - np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) def get_nzs_from_edges(self, edges, assign_params=assign_params_default): assign = self.assign_from_edges(edges, assign_params=assign_params) diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 8ef2d31d..dc05921d 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -5,9 +5,9 @@ assign_params_default = {'p_inbin_thr': 0.5, - 'p_outbin_thr': 0.2, - 'use_p_inbin': False, - 'use_p_outbin': False} + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} class SnCalc(object): @@ -60,7 +60,7 @@ def _bz_model(self, cosmo, z): def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. Args: - fname_save (string): if not None, the result will be saved to a + fname_save (string): if not None, the result will be saved to a file by this name, and if present and `recompute=False`, the matrix will be read from that file. 
recompute (bool): if `True`, any previous calculations of the @@ -74,22 +74,22 @@ def get_cl_matrix(self, fname_save=None, recompute=False): # Cosmology cosmo = ccl.Cosmology(Omega_c=0.25, - Omega_b=0.05, - h=0.67, n_s=0.96, - sigma8=0.81) + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) # Tracers if self.use_clustering: trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bias=(self.z_arr, + self._bz_model(cosmo, + self.z_arr))) + for nz in self.nz_list] else: trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + for nz in self.nz_list] - # Cls + # Cls self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) for i in range(self.n_samples): for j in range(i, self.n_samples): @@ -161,10 +161,10 @@ def get_sn_wn(self, weights, n_dens, full_output=False): icl = np.linalg.inv(cl) sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], - axis=2) + axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + self.d_ell / self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -175,26 +175,26 @@ def get_sn_wn(self, weights, n_dens, full_output=False): def get_sn(self, assign, full_output=False): """ Compute signal-to-noise ratio from a bin assignment list. Args: - assign (list): list of arrays. Each element of the list should + assign (list): list of arrays. Each element of the list should be a numpy array of integers, corresponding to the indices of the initial groups that are now resampled into a larger group. full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ w, ndens = self.get_resample_metadata(assign) return self.get_sn_wn(w, ndens, - full_output=full_output) + full_output=full_output) def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. 
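        (i.e. any negative edge, any edge above `edges_large`, or any
        pair of consecutive edges in decreasing order).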
""" return np.any(edges < 0) or \ - np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + np.any(edges > self.edges_large) or \ + np.any(np.diff(edges) < 0) def get_nzs_from_edges(self, edges, assign_params=assign_params_default): assign = self.assign_from_edges(edges, assign_params=assign_params) From 42913cc3d6ed8c75823a7ea186021aec6e9083a5 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 11:41:50 +0200 Subject: [PATCH 18/35] updated gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bc56b5a7..2b112cc2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ +data/* plots/* - +wl_*.npz +*.pkl From dc0512853dd1db13fba319be648c7793e15f0978 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 11:42:20 +0200 Subject: [PATCH 19/35] updated gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2b112cc2..94242df7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +data data/* plots/* wl_*.npz From 453e89927a3cf873a4bb7a3d4e03bb901c0de158 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 11:48:33 +0200 Subject: [PATCH 20/35] added save of SOM --- example/ComplexSOM.yaml | 12 ++++--- tomo_challenge/classifiers/ComplexSOM.py | 41 +++++++++++++++++++----- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index e19130f4..4e386b5f 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -1,8 +1,8 @@ metrics: [SNR_3x2] bands: riz training_file: data/training.hdf5 -validation_file: data/validation_small.hdf5 -output_file: example/ComplexSOM_output.txt +validation_file: data/validation.hdf5 +output_file: example/ComplexSOM_output_manygroup.txt # Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" metrics_impl: firecrown @@ -11,16 +11,18 @@ run: ComplexSOM: run_3: # These settings are sent to the classifier - bins: 5 - som_dim: [51,51] + bins: 7 + som_dim: [21,21] num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.15 + sparse_frac: 0.05 plots: True plot_dir: 'plots' metric: "SNR_3x2" + use_inbin: True + use_outbin: True # These special settings decide whether the # color and error colums are passed to the classifier # as well as the magnitudes diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index ea37a806..18704aa1 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -25,6 +25,7 @@ from ..snc import SnCalc from scipy.optimize import minimize from scipy.integrate import simps +import pickle #Check that all the needed packages are installed # R package nameo @@ -50,7 +51,8 @@ class ComplexSOM(Tomographer): # valid parameter -- see below valid_options = ['bins','som_dim','num_groups','num_threads', - 'group_type','data_threshold','sparse_frac','plots','plot_dir','metric'] + 'group_type','data_threshold','sparse_frac','plots', + 'plot_dir','metric','use_inbin','use_outbin'] # this settings means arrays will be sent to train and apply instead # of dictionaries wants_arrays = False @@ -109,6 +111,10 @@ def train (self, training_data, training_z): data_threshold = self.opt['data_threshold'] #Metric metric = self.opt['metric'] + #Flag bad bins with inbin probability + use_inbin = self.opt['use_inbin'] + #Flag bad bins with outbin probability + use_outbin = self.opt['use_outbin'] 
#Plots plots = self.opt['plots'] #Plot Output Directory @@ -118,6 +124,11 @@ def train (self, training_data, training_z): #Check that the group type is a valid choice if group_type != 'colour' and group_type != 'redshift': group_type = 'redshift' + #Define the assign_params variable, used in bin optimisation + assign_params = {'p_inbin_thr': 0.5, + 'p_outbin_thr': 0.2, + 'use_p_inbin': use_inbin, + 'use_p_outbin': use_outbin} #Define the redshift summary statistics (used for making groups in the 'redshift' case property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true") @@ -190,10 +201,22 @@ def train (self, training_data, training_z): #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) train_df = ro.conversion.py2rpy(training_data) - print("Training the SOM using R kohtrain") - #Train the SOM using R kohtrain - som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), - n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + #Construct or Load the SOM + som_outname = f"SOM_{som_dim}_{self.bands}.pkl" + if not os.path.exists(som_outname): + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + #Output the SOM + #base.save(som,file=som_outname) + with open(som_outname, 'wb') as f: + pickle.dump(som, f) + else: + print("Loading the pretrained SOM") + with open(som_outname, 'rb') as f: + som = pickle.load(f) + som.rx2['unit.classif']=FloatVector([]) #If grouping by redshift, construct the cell redshift statistics if group_type == 'redshift' or plots == True: @@ -203,7 +226,9 @@ def train (self, training_data, training_z): expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) print("Constructing redshift-based hierarchical cluster tree") #Cluster the SOM cells into num_groups groups - props = cell_prop.rx2['property'] + props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']), + data_missing='NA',data_threshold=FloatVector([0,12])) + props = props.rx2("data.white") props.rx[base.which(base.is_na(props))] = -1 print(base.summary(props)) hclust=stats.hclust(stats.dist(props)) @@ -306,13 +331,13 @@ def train (self, training_data, training_z): c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin) # Finally, let's write the function to minimize and optimize for a 4-bin case. 
def minus_sn(edges, calc): - return -calc.get_sn_from_edges(edges) + return -calc.get_sn_from_edges(edges,assign_params=assign_params) print("Optimizing WL") edges_0 = np.linspace(0, 2, n_bin-1) res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) print("WL final edges: ", res.x) - print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)*np.sqrt(0.25/fsky)) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)*np.sqrt(0.25/fsky),assign_params=assign_params) print(" ") print(res) From 26f138592493ba75507403a495bb1c867bfde7f9 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 11:49:04 +0200 Subject: [PATCH 21/35] changed settings --- example/ComplexSOM.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 4e386b5f..56f20ff9 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -12,12 +12,12 @@ run: run_3: # These settings are sent to the classifier bins: 7 - som_dim: [21,21] + som_dim: [51,51] num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.05 + sparse_frac: 0.25 plots: True plot_dir: 'plots' metric: "SNR_3x2" From a2d7355ee6d932ca2e06e2b2b399d0cfdcb95262 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 15:10:16 +0200 Subject: [PATCH 22/35] fix bug in SNR calculation call --- tomo_challenge/classifiers/ComplexSOM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 18704aa1..d83d3d7f 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -337,7 +337,7 @@ def minus_sn(edges, calc): edges_0 = np.linspace(0, 2, n_bin-1) res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) print("WL final edges: ", res.x) - print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)*np.sqrt(0.25/fsky),assign_params=assign_params) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x,assign_params=assign_params)*np.sqrt(0.25/fsky)) print(" ") print(res) From 7fae683fc9c876a09777925eb00b3d98a8cad969 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 1 Sep 2020 15:47:05 +0200 Subject: [PATCH 23/35] speed up when using pretrained SOM --- tomo_challenge/classifiers/ComplexSOM.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index d83d3d7f..a3064b07 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -224,6 +224,8 @@ def train (self, training_data, training_z): #Construct the Nz properties per SOM cell cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + som=cell_prop.rx2('som') + som.rx2['clust.classif']=FloatVector([]) print("Constructing redshift-based hierarchical cluster tree") #Cluster the SOM cells into num_groups groups props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']), From 83d1bf650a72ee4f88053b99e7977758e05398ed Mon Sep 17 00:00:00 2001 From: David Alonso Date: Tue, 1 Sep 2020 15:04:56 +0100 Subject: [PATCH 24/35] safe integration --- the_atlantics/snc.py | 12 ++++++------ tomo_challenge/snc.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index dc05921d..db83e1c8 100644 --- a/the_atlantics/snc.py +++ 
b/the_atlantics/snc.py @@ -219,23 +219,23 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, # Bin IDs based on mean z ids = np.digitize(self.z_means, bins=edges) if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) # Calculate probabilities in each bin zgroups = np.digitize(self.z_arr, bins=edges) masks = [zgroups == i for i in range(nbins)] if assign_params['use_p_outbin']: # Matrix of probabilities in each bin - pzs = np.array([[simps(pz[m], x=self.z_arr[m]) + pzs = np.array([[integrate_safe(pz, self.z_arr, m) for m in masks] for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] for j in range(self.n_samples)]) else: # Overlaps in own bin - def integrate_safe(p, z, m): - if np.sum(m) == 0: - return 0 - else: - return simps(p[m], x=z[m]) pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) for pz, i in zip(self.pz_list, ids)]) # Throw away based on in-bin probability diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index dc05921d..db83e1c8 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -219,23 +219,23 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, # Bin IDs based on mean z ids = np.digitize(self.z_means, bins=edges) if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) # Calculate probabilities in each bin zgroups = np.digitize(self.z_arr, bins=edges) masks = [zgroups == i for i in range(nbins)] if assign_params['use_p_outbin']: # Matrix of probabilities in each bin - pzs = np.array([[simps(pz[m], x=self.z_arr[m]) + pzs = np.array([[integrate_safe(pz, self.z_arr, m) for m in masks] for pz in self.pz_list]) pzd = np.array([pzs[j, ids[j]] for j in range(self.n_samples)]) else: # Overlaps in own bin - def integrate_safe(p, z, m): - if np.sum(m) == 0: - return 0 - else: - return simps(p[m], x=z[m]) pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) for pz, i in zip(self.pz_list, ids)]) # Throw away based on in-bin probability From 0bb9f7999883cc660527b26d0c2ffa8201efcb95 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Fri, 4 Sep 2020 18:06:03 +0200 Subject: [PATCH 25/35] Merge branch 'the_atlantics' of github.com:damonge/tomo_challenge into the_atlantics --- example/ComplexSOM.yaml | 14 +++++++------- tomo_challenge/classifiers/ComplexSOM.py | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 56f20ff9..5670a39c 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -1,8 +1,8 @@ metrics: [SNR_3x2] bands: riz -training_file: data/training.hdf5 -validation_file: data/validation.hdf5 -output_file: example/ComplexSOM_output_manygroup.txt +training_file: data_buzzard/training.hdf5 +validation_file: data_buzzard/validation.hdf5 +output_file: example/ComplexSOM_output.txt # Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo" metrics_impl: firecrown @@ -11,18 +11,18 @@ run: ComplexSOM: run_3: # These settings are sent to the classifier - bins: 7 + bins: 5 som_dim: [51,51] num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.25 + sparse_frac: 0.5 plots: True plot_dir: 'plots' metric: "SNR_3x2" - use_inbin: True - use_outbin: True + use_inbin: False + use_outbin: False # These special settings decide whether the # color 
and error colums are passed to the classifier # as well as the magnitudes diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index a3064b07..d82fb034 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -223,7 +223,7 @@ def train (self, training_data, training_z): print("Constructing cell-based redshift properties") #Construct the Nz properties per SOM cell cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, - expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + expression=StrVector(property_expressions),expr_label=StrVector(property_labels),returnMatrix=True) som=cell_prop.rx2('som') som.rx2['clust.classif']=FloatVector([]) print("Constructing redshift-based hierarchical cluster tree") @@ -244,7 +244,7 @@ def train (self, training_data, training_z): print("Constructing group-based redshift properties") group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, expression=StrVector(property_expressions),expr_label=StrVector(property_labels), - n_cluster_bins=num_groups) + n_cluster_bins=num_groups,returnMatrix=True) #extract the training som (just for convenience) train_som = group_prop.rx2('som') @@ -387,7 +387,7 @@ def apply (self, data): #Generate the validation associations/groups group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, expression="nrow(data)",expr_label="N", - n_cluster_bins=num_groups,n_cores=self.opt['num_threads']) + n_cluster_bins=num_groups,n_cores=self.opt['num_threads'],returnMatrix=True) #extract the validation som (just for convenience) valid_som = group_prop.rx2('som') From 038dcf669f10cbbed05545bac0195b2ff4ad334b Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 8 Sep 2020 15:07:11 +0200 Subject: [PATCH 26/35] added testing mode --- bin/challenge.py | 20 ++- example/ComplexSOM.yaml | 29 +++- tomo_challenge/classifiers/ComplexSOM.py | 172 ++++++++++++++--------- tomo_challenge/data.py | 11 +- 4 files changed, 151 insertions(+), 81 deletions(-) diff --git a/bin/challenge.py b/bin/challenge.py index 5ec60273..dcabe9e4 100755 --- a/bin/challenge.py +++ b/bin/challenge.py @@ -27,12 +27,15 @@ def main(config_yaml): # Decide if anyone needs the colors calculating and/or errors loading anyone_wants_colors = False anyone_wants_errors = False + anyone_wants_sizes = False for run in config['run'].values(): for version in run.values(): if version.get('errors'): anyone_wants_errors = True if version.get('colors'): anyone_wants_colors = True + if version.get('sizes'): + anyone_wants_sizes = True bands = config['bands'] @@ -41,14 +44,16 @@ def main(config_yaml): config['training_file'], bands, errors=anyone_wants_errors, - colors=anyone_wants_colors + colors=anyone_wants_colors, + size=anyone_wants_sizes ) validation_data = tc.load_data( config['validation_file'], bands, errors=anyone_wants_errors, - colors=anyone_wants_colors + colors=anyone_wants_colors, + size=anyone_wants_sizes ) training_z = tc.load_redshift(config['training_file']) @@ -59,7 +64,7 @@ def main(config_yaml): else: metrics_fn = tc.compute_scores - with open(config['output_file'],'w') as output_file: + with open(config['output_file'],'a') as output_file: for classifier_name, runs in config['run'].items(): for run, settings in runs.items(): scores = run_one(classifier_name, bands, settings, @@ -77,15 +82,16 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, if classifier.wants_arrays: errors = 
settings.get('errors') colors = settings.get('colors') - train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors) - valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors) + sizes = settings.get('sizes') + train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors, sizes=sizes) + valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors, sizes=sizes) print ("Executing: ", classifier_name, bands, settings) ## first check if options are valid print (settings, classifier.valid_options) for key in settings.keys(): - if key not in classifier.valid_options and key not in ['errors', 'colors']: + if key not in classifier.valid_options and key not in ['errors', 'colors', 'sizes']: raise ValueError(f"Key {key} is not recognized by classifier {classifier_name}") print ("Initializing classifier...") @@ -102,7 +108,7 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, print ("Making some pretty plots...") name = str(classifier.__name__) - tc.metrics.plot_distributions(valid_z, results, f"plots/{name}_{settings}_{bands}.png") + tc.metrics.plot_distributions(valid_z, results, f"plots/{name}_{bands}.png") return scores diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 5670a39c..0f19205f 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -9,15 +9,16 @@ metrics_impl: firecrown run: # This is a class name which will be looked up ComplexSOM: - run_3: + run_5: # These settings are sent to the classifier + testing: True bins: 5 som_dim: [51,51] - num_groups: 100 + num_groups: 200 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] - sparse_frac: 0.5 + sparse_frac: 1.0 plots: True plot_dir: 'plots' metric: "SNR_3x2" @@ -28,4 +29,26 @@ run: # as well as the magnitudes colors: False errors: False + sizes: False + run_10: + # These settings are sent to the classifier + testing: False + bins: 10 + som_dim: [51,51] + num_groups: 200 + num_threads: 32 + group_type: 'redshift' + data_threshold: [0,30] + sparse_frac: 1.0 + plots: True + plot_dir: 'plots' + metric: "SNR_3x2" + use_inbin: False + use_outbin: False + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: False + errors: False + sizes: False diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index d82fb034..915ba17e 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -52,7 +52,7 @@ class ComplexSOM(Tomographer): # valid parameter -- see below valid_options = ['bins','som_dim','num_groups','num_threads', 'group_type','data_threshold','sparse_frac','plots', - 'plot_dir','metric','use_inbin','use_outbin'] + 'plot_dir','metric','use_inbin','use_outbin','testing'] # this settings means arrays will be sent to train and apply instead # of dictionaries wants_arrays = False @@ -138,85 +138,117 @@ def train (self, training_data, training_z): #Define the SOM variables if self.bands == 'riz': #riz bands - #expressions = ("r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", - # "z_mag","r_mag-i_mag-(i_mag-z_mag)") - expressions = ("r-i","r-z","i-z", - "z","r-i-(i-z)") + if self.opt['sizes'] == True: + expressions = ("r-i","r-z","i-z", + "z","r-i-(i-z)","mcal_T") + else: + expressions = ("r-i","r-z","i-z", + "z","r-i-(i-z)") elif self.bands == 'griz': #griz bands - #expressions = ("g_mag-r_mag","g_mag-i_mag", - # 
"g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", - # "z_mag","g_mag-r_mag-(r_mag-i_mag)", - # "r_mag-i_mag-(i_mag-z_mag)") - expressions = ("g-r","g-i", - "g-z","r-i","r-z","i-z", - "z","g-r-(r-i)", - "r-i-(i-z)") + if self.opt['sizes'] == True: + expressions = ("g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","g-r-(r-i)", + "r-i-(i-z)","mcal_T") + else: + expressions = ("g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","g-r-(r-i)", + "r-i-(i-z)") elif self.bands == 'grizy': #grizy bands - #expressions = ("g_mag-r_mag","g_mag-i_mag", - # "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", - # "z_mag-y_mag","z_mag","g_mag-r_mag-(r_mag-i_mag)", - # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") - expressions = ("g-r","g-i", - "g-z","g-y","r-i","r-z","r-y","i-z","i-y", - "z-y","z","g-r-(r-i)", - "r-i-(i-z)","i-z-(z-y)") + if self.opt['sizes'] == True: + expressions = ("g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)","mcal_T") + else: + expressions = ("g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") elif self.bands == 'ugriz': #ugrizy bands - #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","g_mag-r_mag","g_mag-i_mag", - # "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", - # "z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", - # "r_mag-i_mag-(i_mag-z_mag)") - expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", - "g-z","r-i","r-z","i-z", - "z","u-g-(g-r)","g-r-(r-i)", - "r-i-(i-z)") + if self.opt['sizes'] == True: + expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","mcal_T") + else: + expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)") elif self.bands == 'ugrizy': #ugrizy bands - #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","u_mag-y_mag","g_mag-r_mag","g_mag-i_mag", - # "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", - # "z_mag-y_mag","z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", - # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") - expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", - "g-z","g-y","r-i","r-z","r-y","i-z","i-y", - "z-y","z","u-g-(g-r)","g-r-(r-i)", - "r-i-(i-z)","i-z-(z-y)") - - print("Preparing the data") - training_data = pd.DataFrame.from_dict(training_data) - #Add the redshift variable to the train data - print("Adding redshift info to training data") - training_data['redshift_true'] = training_z - - if sparse_frac < 1: - print("Sparse Sampling the training data") - cut = np.random.uniform(0, 1, training_z.size) < sparse_frac - training_data = training_data[cut] - training_z = training_z[cut] - - #Construct the training data frame (just a python-to-R data conversion) - print("Converting the data to R format") - with localconverter(ro.default_converter + pandas2ri.converter): - #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) - train_df = ro.conversion.py2rpy(training_data) - - #Construct or Load the SOM - som_outname = f"SOM_{som_dim}_{self.bands}.pkl" - if not os.path.exists(som_outname): - print("Training the SOM using R kohtrain") - #Train the SOM using R kohtrain - 
som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), - n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) - #Output the SOM - #base.save(som,file=som_outname) - with open(som_outname, 'wb') as f: - pickle.dump(som, f) + if self.opt['sizes'] == True: + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)","mcal_T") + else: + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + + + #Define the output SOM name + if self.opt['sizes'] == True: + som_outname = f"SOM_{som_dim}_{self.bands}_withsize.pkl" else: - print("Loading the pretrained SOM") - with open(som_outname, 'rb') as f: + som_outname = f"SOM_{som_dim}_{self.bands}.pkl" + + #If Testing, load the pretrained SOM and matching training catalogue + if self.opt['testing'] == True: + #Use the default training set + with open(f"test_{som_outname}", 'rb') as f: som = pickle.load(f) - som.rx2['unit.classif']=FloatVector([]) + print(base.length(som.rx2['unit.classif'])) + with open("training_testcat.pkl", 'rb') as f: + train_df = pickle.load(f) + print(utils.str(train_df)) + else: + #Use the training set provided + print("Preparing the data") + training_data = pd.DataFrame.from_dict(training_data) + #Add the redshift variable to the train data + print("Adding redshift info to training data") + training_data['redshift_true'] = training_z + + if sparse_frac < 1: + print("Sparse Sampling the training data") + cut = np.random.uniform(0, 1, training_z.size) < sparse_frac + training_data = training_data[cut] + training_z = training_z[cut] + + #Construct the training data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + train_df = ro.conversion.py2rpy(training_data) + + #with open("training_testcat.pkl", 'wb') as f: + # pickle.dump(train_df, f) + + #Construct or Load the SOM + if not os.path.exists(som_outname): + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + print(base.length(som.rx2['unit.classif'])) + #Output the SOM + #base.save(som,file=som_outname) + #with open(f"test_{som_outname}", 'wb') as f: + # pickle.dump(som, f) + with open(som_outname, 'wb') as f: + pickle.dump(som, f) + else: + print("Loading the pretrained SOM") + with open(som_outname, 'rb') as f: + som = pickle.load(f) + som.rx2['unit.classif']=FloatVector([]) #If grouping by redshift, construct the cell redshift statistics if group_type == 'redshift' or plots == True: diff --git a/tomo_challenge/data.py b/tomo_challenge/data.py index 03eadc5c..5b8a7af4 100644 --- a/tomo_challenge/data.py +++ b/tomo_challenge/data.py @@ -124,6 +124,12 @@ def add_colors(data, bands, errors=False): if errors: data[f'{b}{c}_err'] = np.sqrt(data[f'{b}_err']**2 + data[f'{c}_err']**2) +def add_size(data, filename): + + with h5py.File(filename, 'r') as f: + # load all bands + data['mcal_T'] = f[f'mcal_T'][:] + def dict_to_array(data, bands, errors=False, colors=False): nobj = data[bands[0]].size @@ -165,12 +171,15 @@ 
def colors_for_bands(bands): -def load_data(filename, bands, colors=False, errors=False, array=False): +def load_data(filename, bands, colors=False, errors=False, size=False, array=False): data = load_mags(filename, bands, errors=errors) if colors: add_colors(data, bands, errors=errors) + if size: + add_size(data, filename) + if array: data = dict_to_array(data, bands, errors=errors, colors=colors) From 51f357a750b3fa81ac81dbade8d287f5346162de Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Tue, 8 Sep 2020 15:07:54 +0200 Subject: [PATCH 27/35] lowered number of groups to 100 --- example/ComplexSOM.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index 0f19205f..e6aea060 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -14,7 +14,7 @@ run: testing: True bins: 5 som_dim: [51,51] - num_groups: 200 + num_groups: 100 num_threads: 32 group_type: 'redshift' data_threshold: [0,30] From 49aa17d4a01104fb55434b7839afcbe4aa7e7397 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Thu, 10 Sep 2020 12:19:13 +0200 Subject: [PATCH 28/35] fixed bug in group_type --- tomo_challenge/classifiers/ComplexSOM.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 915ba17e..5f2c8365 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -265,12 +265,16 @@ def train (self, training_data, training_z): props = props.rx2("data.white") props.rx[base.which(base.is_na(props))] = -1 print(base.summary(props)) + + if group_type=='redshift': + print("Groups will be constructed by redshift property") hclust=stats.hclust(stats.dist(props)) cell_group=stats.cutree(hclust,k=num_groups) - #Assign the cell groups to the SOM structure som.rx2['hclust']=hclust som.rx2['cell_clust']=cell_group + else: + print("Groups will be constructed by redshift property") #Construct the Nz properties per SOM group print("Constructing group-based redshift properties") From 5ca0871841ddae08e02b8f5a166b3e7b0c64369b Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Thu, 10 Sep 2020 12:20:39 +0200 Subject: [PATCH 29/35] fixed print typo --- tomo_challenge/classifiers/ComplexSOM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 5f2c8365..90d463b1 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -274,7 +274,7 @@ def train (self, training_data, training_z): som.rx2['hclust']=hclust som.rx2['cell_clust']=cell_group else: - print("Groups will be constructed by redshift property") + print("Groups will be constructed by colour property") #Construct the Nz properties per SOM group print("Constructing group-based redshift properties") From 860f54e847a6c2e32deddfa62b704a13cc32dfa6 Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Fri, 11 Sep 2020 16:50:59 +0200 Subject: [PATCH 30/35] added redshift_propset and whiten options --- tomo_challenge/classifiers/ComplexSOM.py | 49 +++++++++++++++++------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 90d463b1..2ee47bc6 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -52,7 +52,8 @@ class ComplexSOM(Tomographer): # valid parameter -- see below 
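    # For orientation, a hedged NumPy sketch of the per-group summary
    # statistics that the new `redshift_propset` option selects below
    # (`z` stands for the true redshifts of one SOM group; the helper
    # name is illustrative, and note that R's mad() applies a 1.4826
    # scaling which this sketch omits):
    #   def group_props(z, propset=2):
    #       med = np.median(z)
    #       if propset == 2:
    #           return np.mean(z), med, np.median(np.abs(z - med))
    #       # propset 3: quantile summary of the group N(z)
    #       return np.quantile(z, [0.05, 0.16, 0.50, 0.84, 0.95])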
valid_options = ['bins','som_dim','num_groups','num_threads', 'group_type','data_threshold','sparse_frac','plots', - 'plot_dir','metric','use_inbin','use_outbin','testing'] + 'plot_dir','metric','use_inbin','use_outbin','testing', + 'whiten','redshift_propset'] # this settings means arrays will be sent to train and apply instead # of dictionaries wants_arrays = False @@ -117,6 +118,10 @@ def train (self, training_data, training_z): use_outbin = self.opt['use_outbin'] #Plots plots = self.opt['plots'] + #redshift properties to use in grouping + redshift_propset = self.opt['redshift_propset'] + #Whiten the redshift data when grouping? + whiten = self.opt['whiten'] #Plot Output Directory plot_dir = self.opt['plot_dir'] #Group Type @@ -131,10 +136,24 @@ def train (self, training_data, training_z): 'use_p_outbin': use_outbin} #Define the redshift summary statistics (used for making groups in the 'redshift' case - property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true") - property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", - "mad(data$redshift_true)", - "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))") + if redshift_propset==1: + property_labels = StrVector(("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true")) + property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", + "mad(data$redshift_true)", + "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))")) + elif redshift_propset==2: + property_labels = StrVector(("mean_z_true","med_z_true","mad_z_true")) + property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)", + "mad(data$redshift_true)")) + elif redshift_propset==3: + property_labels = StrVector(("quan_05","quan_16","quan_50","quan_84","quan_95")) + property_expressions = ("quantile(data$redshift_true,probs=c(5,16,50,84,95)/100)") + else: + property_labels = StrVector(("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true")) + property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", + "mad(data$redshift_true)", + "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))")) + #Define the SOM variables if self.bands == 'riz': #riz bands @@ -193,7 +212,6 @@ def train (self, training_data, training_z): "z-y","z","u-g-(g-r)","g-r-(r-i)", "r-i-(i-z)","i-z-(z-y)") - #Define the output SOM name if self.opt['sizes'] == True: som_outname = f"SOM_{som_dim}_{self.bands}_withsize.pkl" @@ -255,15 +273,20 @@ def train (self, training_data, training_z): print("Constructing cell-based redshift properties") #Construct the Nz properties per SOM cell cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, - expression=StrVector(property_expressions),expr_label=StrVector(property_labels),returnMatrix=True) + expression=property_expressions,expr_label=property_labels,returnMatrix=True) som=cell_prop.rx2('som') som.rx2['clust.classif']=FloatVector([]) print("Constructing redshift-based hierarchical cluster tree") #Cluster the SOM cells into num_groups groups - props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']), - data_missing='NA',data_threshold=FloatVector([0,12])) - props = props.rx2("data.white") - props.rx[base.which(base.is_na(props))] = -1 + if whiten==True: + props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']), + 
data_missing='NA',data_threshold=FloatVector([0,12])) + props = props.rx2("data.white") + props.rx[base.which(base.is_na(props))] = -10 + else: + props = cell_prop.rx2['property'] + props.rx[base.which(base.is_na(props))] = -1 + print(base.summary(props)) if group_type=='redshift': @@ -279,13 +302,13 @@ def train (self, training_data, training_z): #Construct the Nz properties per SOM group print("Constructing group-based redshift properties") group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, - expression=StrVector(property_expressions),expr_label=StrVector(property_labels), + expression=property_expressions,expr_label=property_labels, n_cluster_bins=num_groups,returnMatrix=True) #extract the training som (just for convenience) train_som = group_prop.rx2('som') - if plots == True: + if plots == True and redshift_propset==1: #Make the diagnostic plots props = group_prop.rx2('property') cprops = cell_prop.rx2('property') From 21510aa32640d4e3a87db413622f0c73f35c9ef3 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Fri, 11 Sep 2020 17:45:49 +0100 Subject: [PATCH 31/35] update --- the_atlantics/optimizer.py | 92 +++++++++----- the_atlantics/snc.py | 148 +++++++++++++++++++---- tomo_challenge/classifiers/ComplexSOM.py | 50 +++++--- tomo_challenge/snc.py | 148 +++++++++++++++++++---- 4 files changed, 338 insertions(+), 100 deletions(-) diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py index d64bedc4..f46dd508 100644 --- a/the_atlantics/optimizer.py +++ b/the_atlantics/optimizer.py @@ -4,7 +4,6 @@ from scipy.interpolate import interp1d from scipy.optimize import minimize, brentq from scipy.integrate import simps -import matplotlib.pyplot as plt parser = ArgumentParser() @@ -12,6 +11,8 @@ help="Path to input file containing the N(z) of all initial groups") parser.add_argument("--prefix-out", default='out', type=str, help="Prefix for all output files") +parser.add_argument("--suffix-out", default='', type=str, + help="Suffix for all output files") parser.add_argument("--n-bins", default=4, type=int, help="Number of bins (excluding a possible trash bin") parser.add_argument("--prob-in-bin", default=-1., type=float, @@ -20,6 +21,16 @@ parser.add_argument("--prob-out-bin", default=-1., type=float, help="Maximum fraction of the N(z) that should be inside other bins." 
"If <= 0, this will not be taken into account in the optimization.") +parser.add_argument('--in-bin-free', default=False, action='store_true', + help='Free up in-bin probability (default: False)') +parser.add_argument('--use-clustering', default=False, action='store_true', + help='Use clustering (default: False)') +parser.add_argument('--use-3x2', default=False, action='store_true', + help='Use 3x2 (default: False)') +parser.add_argument('--random-start', default=False, action='store_true', + help='Use random starting points (default: False)') +parser.add_argument('--random-seed', default=1234, type=int, + help='Random seed') o = parser.parse_args() nzs = np.load(o.nz_file) @@ -38,20 +49,41 @@ nzs = nzs * ng_tot / ng_tot_small # Initialize calculator -sc = SnCalc(zz, nzs, fsky=fsky) -sc.get_cl_matrix(fname_save=o.prefix_out + 'cl_wl.npz') +sc = SnCalc(zz, nzs, fsky=fsky, use_clustering=o.use_clustering, use_3x2=o.use_3x2) +if o.use_3x2: + suffix = 'cl_3x2.npz' +else: + if o.use_clustering: + suffix = 'cl_gc.npz' + else: + suffix = 'cl_wl.npz' +sc.get_cl_matrix(fname_save=o.prefix_out + suffix) # Initial edge guess (defined as having equal number of galaxies) -nz_tot = np.sum(nzs, axis=0) -cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) -cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, - fill_value=(0, 1)) -edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) - for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) - +if o.random_start: + np.random.seed(o.random_seed) + z_end = 1.5 + delta_z = z_end / (o.n_bins-1) + edges_0 = delta_z * (np.arange(o.n_bins-1) + np.random.rand(o.n_bins-1)) +else: + nz_tot = np.sum(nzs, axis=0) + cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) + cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + fill_value=(0, 1)) + edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) + for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) + +if o.in_bin_free: + if o.random_start: + pin = np.random.rand() + else: + pin = 0.5 + edges_0 = np.concatenate((np.array([pin]), edges_0)) +print(edges_0) # Minimize # Binning parameters + params = assign_params_default.copy() if o.prob_in_bin > 0: params['p_inbin_thr'] = o.prob_in_bin @@ -59,15 +91,29 @@ if o.prob_out_bin > 0: params['p_outbin_thr'] = o.prob_out_bin params['use_p_outbin'] = True +if o.in_bin_free: + params['use_p_inbin'] = True + params['p_inbin_thr'] = edges_0[0] def minus_sn(edges, calc): - return -calc.get_sn_from_edges(edges, assign_params=params) + if o.in_bin_free: + if (edges[0] < 0) or (edges[0] > 1): + return 0. 
+ params['p_inbin_thr'] = edges[0] + edges_use = edges[1:] + else: + edges_use = edges + return -calc.get_sn_from_edges(edges_use, assign_params=params) # Actual optimization res = minimize(minus_sn, edges_0, method='Powell', args=(sc,)) -edges_1 = res.x +if o.in_bin_free: + edges_1 = res.x[1:] + params['p_inbin_thr'] = res.x[0] +else: + edges_1 = res.x # Post-process: # S/N @@ -92,26 +138,16 @@ def get_bin_name(i): d_out = {'sn': sn, 'edges': edges_1} -plt.figure() +if o.in_bin_free: + d_out['p_in'] = params['p_inbin_thr'] + print("P_in: ", params['p_inbin_thr']) for (i, a), n in zip(assign, nz_best): name = get_bin_name(i) d_out[name+'_groups'] = a d_out[name+'_nz'] = n - print("Bin %d, groups: " % i, a) n_here = simps(n, x=zz) f = n_here / n_tot frac = "%.1lf" % (100 * f) - if i == -1: - bin_name = 'Trash bin' - else: - bin_name = f'Bin {i}' - plt.plot(zz, n/n_here, - label=bin_name + f' ({frac} %)') -plt.legend() -plt.xlabel(r'$z$', fontsize=15) -plt.ylabel(r'$p(z)$', fontsize=15) -plt.savefig(o.prefix_out + '_nz_summary.png', - bbox_inches='tight') - -np.savez(o.prefix_out + '_bin_info.npz', **d_out) -plt.show() + print("Bin %d, groups: " % i, a, frac) + +np.savez(o.prefix_out + o.suffix_out + '_bin_info.npz', **d_out) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index db83e1c8..24fed447 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -13,8 +13,9 @@ class SnCalc(object): edges_large = 3. - def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.26, use_clustering=False, integrator='spline'): + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, n_ell=100, + s_gamma=0.26, use_clustering=False, use_3x2=False, + integrator='qag_quad'):#spline'): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are @@ -32,6 +33,7 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, `use_clustering=False`). use_clustering (bool): if `True`, SNR will be computed for clustering instead of lensing. + use_3x2 (bool): if `True`, SNR will be computed for 3x2pt. integrator (string): CCL integration method. Either 'qag_quad' or 'spline'. """ @@ -39,9 +41,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.s_gamma = s_gamma self.fsky = fsky self.lmax = lmax - self.d_ell = d_ell - self.larr = np.arange(2, lmax, d_ell)+d_ell/2 - self.n_ell = len(self.larr) + self.n_ell = n_ell + ell_edges = np.logspace(2, np.log10(lmax), n_ell+1) + self.larr = 0.5*(ell_edges[1:]+ell_edges[:-1]) + self.d_ell = (ell_edges[1:]-ell_edges[:-1]) self.n_samples = len(nz_list) self.z_arr = z_arr self.nz_list = nz_list @@ -53,9 +56,14 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering + self.use_3x2 = use_3x2 + if use_3x2: + self.n_tracers = 2*self.n_samples + else: + self.n_tracers = self.n_samples def _bz_model(self, cosmo, z): - return 0.95/ccl.growth_factor(cosmo, 1./(1+z)) + return 1./ccl.growth_factor(cosmo, 1./(1+z)) def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. 
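# For reference, a standalone sketch of the bandpower construction set up
# in the __init__ hunk above (assuming the defaults lmax=2000, n_ell=100):
#   import numpy as np
#   ell_edges = np.logspace(2, np.log10(2000), 101)  # 100 log-spaced bands from ell=100
#   ells = 0.5 * (ell_edges[1:] + ell_edges[:-1])    # band centres
#   d_ell = np.diff(ell_edges)                       # band widths
#   n_modes = 0.4 * (ells + 0.5) * d_ell             # fsky*(l+1/2)*dl, as used in get_sn_wn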
@@ -79,20 +87,27 @@ def get_cl_matrix(self, fname_save=None, recompute=False): sigma8=0.81) # Tracers - if self.use_clustering: - trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bz = self._bz_model(cosmo, self.z_arr) + if self.use_3x2: + trs_gc = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + trs_wl = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + trs = trs_gc + trs_wl else: - trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] # Cls - self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) - for i in range(self.n_samples): - for j in range(i, self.n_samples): + self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) + for i in range(self.n_tracers): + for j in range(i, self.n_tracers): cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, limber_integration_method=self.integrator) self.cls[:, i, j] = cl @@ -126,22 +141,99 @@ def _get_cl_resamples(self, weights): """ if self.cls is None: self.get_cl_matrix() - return np.einsum('jl,km,ilm', weights, weights, self.cls) + if self.use_3x2: + w_use = np.tile(weights, (2, 2)) + else: + w_use = weights + return np.sum(np.sum(w_use[None, None, :, :] * + self.cls[:, :, None, :], + axis=-1)[:, None, :, :] * + w_use[None, :, :, None], + axis=-2) def _get_nl_resamples(self, n_dens): """ Gets noise contribution to C_ell matrix for resampled groups from list of number densities. """ - if self.use_clustering: - return np.diag(1./n_dens)[None, :, :] + if self.use_3x2: + n_diag = np.array(list(1./n_dens) + + list(self.s_gamma**2/n_dens)) + return np.diag(n_diag)[None, :, :] else: - return np.diag(self.s_gamma**2/n_dens)[None, :, :] + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_max(self, full_output=False): + """ Compute maximum signal-to-noise ratio given the input + groups + Args: + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + weights = np.eye(self.n_samples) + n_dens = self.n_dens + return self.get_sn_wn(weights, n_dens, full_output=full_output) + + def get_kltrans(self): + """ Compute KL decomposition. + + Returns: + Fisher matrix element for each of the KL modes. 
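+            The modes are obtained by whitening the total covariance with
+            the noise covariance, diagonalizing the whitened matrix at
+            each ell, and sorting the eigenmodes by their Fisher
+            contribution summed over ell (largest first).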
+ """ + nl = self._get_nl_resamples(self.n_dens) + nij = nl*np.ones(self.n_ell)[:, None, None] + cij = self.cls + nij + sij = cij - nij + inv_nij = np.linalg.inv(nij) + metric = inv_nij + + def change_basis(c, m, ev): + return np.array([np.diag(np.dot(ev[i].T, + np.dot(m[i], + np.dot(c[i], + np.dot(m[i], + ev[i]))))) + for i in range(self.n_ell)]) + + def diagonalize(c, m): + im = np.linalg.inv(m) + ll = np.linalg.cholesky(m) + ill = np.linalg.cholesky(im) + cl = np.array([np.dot(np.transpose(ll[i]), + np.dot(c[i], ll[i])) + for i in range(self.n_ell)]) + c_p, v = np.linalg.eigh(cl) + ev = np.array([np.dot(np.transpose(ill[i]), v[i]) + for i in range(self.n_ell)]) + # iden = change_basis(im, m, ev) + return ev, c_p + + e_v, c_p = diagonalize(cij, metric) + s_p = change_basis(sij, metric, e_v) + nmodes_l = (self.fsky * self.d_ell * (self.larr+0.5)) + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + isort = np.argsort(-np.sum(fish_kl, axis=0)) + e_o = e_v[:, :, isort] + # f_o = np.array([np.dot(inv_nij[l], e_o[l, :, :]) + # for l in range(self.n_ell)]) + c_p = change_basis(cij, metric, e_o) + s_p = change_basis(sij, metric, e_o) + + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + fish_permode = np.sum(fish_kl, axis=0) + fish_cumul = np.cumsum(fish_permode) + return fish_kl, fish_permode, fish_cumul def get_sn_wn(self, weights, n_dens, full_output=False): """ Compute signal-to-noise ratio from weights matrix and number densities of the resampled groups. Args: - weights (array_like): weights matrix of shape + weights (array_like): weights matrix of shape `(N_resample, N_initial)`, where `N_resample` is the number of new groups, and `N_initial` is the original number of groups. Each entry corresponds to the weight @@ -152,7 +244,7 @@ def get_sn_wn(self, weights, n_dens, full_output=False): full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ sl = self._get_cl_resamples(weights) @@ -163,8 +255,8 @@ def get_sn_wn(self, weights, n_dens, full_output=False): sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) - snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + snr = np.sqrt(np.sum(trsn2_ell * (self.larr + 0.5) * + self.d_ell * self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -192,9 +284,12 @@ def get_sn(self, assign, full_output=False): def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. """ + swapped = False + if np.ndim(edges) > 0: + swapped = np.any(np.diff(edges) < 0) return np.any(edges < 0) or \ np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + swapped def get_nzs_from_edges(self, edges, assign_params=assign_params_default): assign = self.assign_from_edges(edges, assign_params=assign_params) @@ -215,6 +310,7 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, Returns: List of assignment arrays ready to be used in e.g. `get_sn`. 
""" + edges = np.atleast_1d(edges) nbins = len(edges) + 1 # Bin IDs based on mean z ids = np.digitize(self.z_means, bins=edges) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index d83d3d7f..986c3f8e 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -23,8 +23,9 @@ from rpy2.robjects import pandas2ri from rpy2.robjects.conversion import localconverter from ..snc import SnCalc -from scipy.optimize import minimize -from scipy.integrate import simps +from scipy.interpolate import interp1d +from scipy.optimize import minimize, brentq +from scipy.integrate import simps import pickle #Check that all the needed packages are installed @@ -126,9 +127,9 @@ def train (self, training_data, training_z): group_type = 'redshift' #Define the assign_params variable, used in bin optimisation assign_params = {'p_inbin_thr': 0.5, - 'p_outbin_thr': 0.2, - 'use_p_inbin': use_inbin, - 'use_p_outbin': use_outbin} + 'p_outbin_thr': 0.2, + 'use_p_inbin': use_inbin, + 'use_p_outbin': use_outbin} #Define the redshift summary statistics (used for making groups in the 'redshift' case property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true") @@ -303,8 +304,8 @@ def train (self, training_data, training_z): z_arr = np.linspace(0, 2, 124) z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2. #Construct the per-group Nz - nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) for group in np.arange(num_groups)[np.array(tab)!=0]] - + nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0]) + for group in np.arange(num_groups)[np.array(tab)!=0]] #np.save('plots/nzs.npy', nzs) #Update the fsky @@ -315,35 +316,42 @@ def train (self, training_data, training_z): area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky ng_tot_goal = ndens_arcmin * area_arcmin ng_tot_curr = np.sum(np.array(tab)) - nzs = [nz * ng_tot_goal / ng_tot_curr for nz in nzs] - + nzs = np.array([nz * ng_tot_goal / ng_tot_curr for nz in nzs]) #np.save('plots/nzs_norm.npy', nzs) - - #print(nzs) # Now initialize a S/N calculator for these initial groups. os.system('rm wl_nb%d.npz' % n_bin) if metric == 'SNR_ww': - c_wl = SnCalc(z_cen, nzs, use_clustering=False,fsky=fsky) + c_wl = SnCalc(z_cen, nzs, use_clustering=False, fsky=fsky) else: c_wl = SnCalc(z_cen, nzs, use_clustering=True,fsky=fsky) - print("Initializing WL") + print("Initializing Cls") c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin) + + # Initial edge guess + nz_tot = np.sum(nzs, axis=0) + cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) + cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + fill_value=(0, 1)) + edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) + for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) + # Finally, let's write the function to minimize and optimize for a 4-bin case. 
def minus_sn(edges, calc): return -calc.get_sn_from_edges(edges,assign_params=assign_params) print("Optimizing WL") - edges_0 = np.linspace(0, 2, n_bin-1) res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) - print("WL final edges: ", res.x) - print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x,assign_params=assign_params)*np.sqrt(0.25/fsky)) + print("Final edges: ", res.x) + print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x, + assign_params=assign_params)) print(" ") - print(res) #Construct the per-group Nz print("Outputting trained SOM and redshift ordering of SOM groups") - groups_in_tomo = c_wl.assign_from_edges(res.x, get_ids=True) + groups_in_tomo = c_wl.assign_from_edges(res.x, + assign_params=assign_params, + get_ids=True) group_bins = np.zeros(num_groups) for bin_no, groups in groups_in_tomo: print(bin_no, groups) @@ -392,8 +400,10 @@ def apply (self, data): #Assign the sources, by group, to tomographic bins print("Output source tomographic bin assignments") - valid_bin = base.unsplit(IntVector(self.group_bins),FactorVector(valid_som.rx2['clust.classif'], - levels=base.seq(num_groups)),drop=False) + valid_bin = base.unsplit(IntVector(self.group_bins), + FactorVector(valid_som.rx2['clust.classif'], + levels=base.seq(num_groups)), + drop=False) valid_bin = np.array(valid_bin) return valid_bin diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index db83e1c8..24fed447 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -13,8 +13,9 @@ class SnCalc(object): edges_large = 3. - def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, - s_gamma=0.26, use_clustering=False, integrator='spline'): + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, n_ell=100, + s_gamma=0.26, use_clustering=False, use_3x2=False, + integrator='qag_quad'):#spline'): """ S/N calculator Args: z_arr (array_like): array of redshifts at which all the N(z)s are @@ -32,6 +33,7 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, `use_clustering=False`). use_clustering (bool): if `True`, SNR will be computed for clustering instead of lensing. + use_3x2 (bool): if `True`, SNR will be computed for 3x2pt. integrator (string): CCL integration method. Either 'qag_quad' or 'spline'. """ @@ -39,9 +41,10 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.s_gamma = s_gamma self.fsky = fsky self.lmax = lmax - self.d_ell = d_ell - self.larr = np.arange(2, lmax, d_ell)+d_ell/2 - self.n_ell = len(self.larr) + self.n_ell = n_ell + ell_edges = np.logspace(2, np.log10(lmax), n_ell+1) + self.larr = 0.5*(ell_edges[1:]+ell_edges[:-1]) + self.d_ell = (ell_edges[1:]-ell_edges[:-1]) self.n_samples = len(nz_list) self.z_arr = z_arr self.nz_list = nz_list @@ -53,9 +56,14 @@ def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, d_ell=10, self.n_dens = self.n_gals / (4*np.pi*self.fsky) self.cls = None self.use_clustering = use_clustering + self.use_3x2 = use_3x2 + if use_3x2: + self.n_tracers = 2*self.n_samples + else: + self.n_tracers = self.n_samples def _bz_model(self, cosmo, z): - return 0.95/ccl.growth_factor(cosmo, 1./(1+z)) + return 1./ccl.growth_factor(cosmo, 1./(1+z)) def get_cl_matrix(self, fname_save=None, recompute=False): """ Computes matrix of power spectra between all the initial groups. 
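# The bias model above, b(z) = 1/D(z), describes a sample whose clustering
# amplitude b(z)*D(z) is constant with redshift. A quick check, assuming
# illustrative cosmological parameters:
#   import pyccl as ccl
#   cosmo = ccl.Cosmology(Omega_c=0.25, Omega_b=0.05, h=0.67,
#                         n_s=0.96, sigma8=0.81)
#   z = 0.5
#   print(1. / ccl.growth_factor(cosmo, 1. / (1 + z)))  # bias at z=0.5, ~1.3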
@@ -79,20 +87,27 @@ def get_cl_matrix(self, fname_save=None, recompute=False): sigma8=0.81) # Tracers - if self.use_clustering: - trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), - bias=(self.z_arr, - self._bz_model(cosmo, - self.z_arr))) - for nz in self.nz_list] + bz = self._bz_model(cosmo, self.z_arr) + if self.use_3x2: + trs_gc = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + trs_wl = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + trs = trs_gc + trs_wl else: - trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) - for nz in self.nz_list] + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] # Cls - self.cls = np.zeros([self.n_ell, self.n_samples, self.n_samples]) - for i in range(self.n_samples): - for j in range(i, self.n_samples): + self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) + for i in range(self.n_tracers): + for j in range(i, self.n_tracers): cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, limber_integration_method=self.integrator) self.cls[:, i, j] = cl @@ -126,22 +141,99 @@ def _get_cl_resamples(self, weights): """ if self.cls is None: self.get_cl_matrix() - return np.einsum('jl,km,ilm', weights, weights, self.cls) + if self.use_3x2: + w_use = np.tile(weights, (2, 2)) + else: + w_use = weights + return np.sum(np.sum(w_use[None, None, :, :] * + self.cls[:, :, None, :], + axis=-1)[:, None, :, :] * + w_use[None, :, :, None], + axis=-2) def _get_nl_resamples(self, n_dens): """ Gets noise contribution to C_ell matrix for resampled groups from list of number densities. """ - if self.use_clustering: - return np.diag(1./n_dens)[None, :, :] + if self.use_3x2: + n_diag = np.array(list(1./n_dens) + + list(self.s_gamma**2/n_dens)) + return np.diag(n_diag)[None, :, :] else: - return np.diag(self.s_gamma**2/n_dens)[None, :, :] + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_max(self, full_output=False): + """ Compute maximum signal-to-noise ratio given the input + groups + Args: + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + weights = np.eye(self.n_samples) + n_dens = self.n_dens + return self.get_sn_wn(weights, n_dens, full_output=full_output) + + def get_kltrans(self): + """ Compute KL decomposition. + + Returns: + Fisher matrix element for each of the KL modes. 
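+            The modes are obtained by whitening the total covariance with
+            the noise covariance, diagonalizing the whitened matrix at
+            each ell, and sorting the eigenmodes by their Fisher
+            contribution summed over ell (largest first).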
+ """ + nl = self._get_nl_resamples(self.n_dens) + nij = nl*np.ones(self.n_ell)[:, None, None] + cij = self.cls + nij + sij = cij - nij + inv_nij = np.linalg.inv(nij) + metric = inv_nij + + def change_basis(c, m, ev): + return np.array([np.diag(np.dot(ev[i].T, + np.dot(m[i], + np.dot(c[i], + np.dot(m[i], + ev[i]))))) + for i in range(self.n_ell)]) + + def diagonalize(c, m): + im = np.linalg.inv(m) + ll = np.linalg.cholesky(m) + ill = np.linalg.cholesky(im) + cl = np.array([np.dot(np.transpose(ll[i]), + np.dot(c[i], ll[i])) + for i in range(self.n_ell)]) + c_p, v = np.linalg.eigh(cl) + ev = np.array([np.dot(np.transpose(ill[i]), v[i]) + for i in range(self.n_ell)]) + # iden = change_basis(im, m, ev) + return ev, c_p + + e_v, c_p = diagonalize(cij, metric) + s_p = change_basis(sij, metric, e_v) + nmodes_l = (self.fsky * self.d_ell * (self.larr+0.5)) + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + isort = np.argsort(-np.sum(fish_kl, axis=0)) + e_o = e_v[:, :, isort] + # f_o = np.array([np.dot(inv_nij[l], e_o[l, :, :]) + # for l in range(self.n_ell)]) + c_p = change_basis(cij, metric, e_o) + s_p = change_basis(sij, metric, e_o) + + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + fish_permode = np.sum(fish_kl, axis=0) + fish_cumul = np.cumsum(fish_permode) + return fish_kl, fish_permode, fish_cumul def get_sn_wn(self, weights, n_dens, full_output=False): """ Compute signal-to-noise ratio from weights matrix and number densities of the resampled groups. Args: - weights (array_like): weights matrix of shape + weights (array_like): weights matrix of shape `(N_resample, N_initial)`, where `N_resample` is the number of new groups, and `N_initial` is the original number of groups. Each entry corresponds to the weight @@ -152,7 +244,7 @@ def get_sn_wn(self, weights, n_dens, full_output=False): full_output (bool): if true, a dictionary with additional information will be returned. Otherwise just total S/N. Returns: - If `full_output=True`, dictionary containing S/N, power + If `full_output=True`, dictionary containing S/N, power spectra and noise spectra. Otherwise just S/N. """ sl = self._get_cl_resamples(weights) @@ -163,8 +255,8 @@ def get_sn_wn(self, weights, n_dens, full_output=False): sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], axis=2) trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) - snr = np.sqrt(np.sum(trsn2_ell * (2*self.larr + 1) * - self.d_ell / self.fsky)) + snr = np.sqrt(np.sum(trsn2_ell * (self.larr + 0.5) * + self.d_ell * self.fsky)) if full_output: dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} @@ -192,9 +284,12 @@ def get_sn(self, assign, full_output=False): def check_edges(self, edges): """ Returns `True` if there's something wrong with the edges. """ + swapped = False + if np.ndim(edges) > 0: + swapped = np.any(np.diff(edges) < 0) return np.any(edges < 0) or \ np.any(edges > self.edges_large) or \ - np.any(np.diff(edges) < 0) + swapped def get_nzs_from_edges(self, edges, assign_params=assign_params_default): assign = self.assign_from_edges(edges, assign_params=assign_params) @@ -215,6 +310,7 @@ def assign_from_edges(self, edges, assign_params=assign_params_default, Returns: List of assignment arrays ready to be used in e.g. `get_sn`. 
""" + edges = np.atleast_1d(edges) nbins = len(edges) + 1 # Bin IDs based on mean z ids = np.digitize(self.z_means, bins=edges) From 27ed0ddb46d1d73dded2eef4dd81036df5736065 Mon Sep 17 00:00:00 2001 From: David Alonso Date: Fri, 11 Sep 2020 17:48:34 +0100 Subject: [PATCH 32/35] try --- the_atlantics/snc.py | 7 +++++-- tomo_challenge/snc.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py index 24fed447..4c1d537e 100644 --- a/the_atlantics/snc.py +++ b/the_atlantics/snc.py @@ -108,8 +108,11 @@ def get_cl_matrix(self, fname_save=None, recompute=False): self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) for i in range(self.n_tracers): for j in range(i, self.n_tracers): - cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, - limber_integration_method=self.integrator) + try: + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) + except: + cl = np.zeros(len(self.larr)) self.cls[:, i, j] = cl if j != i: self.cls[:, j, i] = cl diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py index 24fed447..4c1d537e 100644 --- a/tomo_challenge/snc.py +++ b/tomo_challenge/snc.py @@ -108,8 +108,11 @@ def get_cl_matrix(self, fname_save=None, recompute=False): self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) for i in range(self.n_tracers): for j in range(i, self.n_tracers): - cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, - limber_integration_method=self.integrator) + try: + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) + except: + cl = np.zeros(len(self.larr)) self.cls[:, i, j] = cl if j != i: self.cls[:, j, i] = cl From 27d519c839eb2bcbeab0d02dd058a808b071eaba Mon Sep 17 00:00:00 2001 From: David Alonso Date: Fri, 11 Sep 2020 18:33:17 +0100 Subject: [PATCH 33/35] fix zz --- tomo_challenge/classifiers/ComplexSOM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 08f232dd..ad4d0a5b 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -392,7 +392,7 @@ def train (self, training_data, training_z): # Initial edge guess nz_tot = np.sum(nzs, axis=0) cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) - cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + cumul_f = interp1d(z_cen, cumulative_fraction, bounds_error=False, fill_value=(0, 1)) edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) From e79cfab85edcfc00687d18dd77061aba86e1109e Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Mon, 14 Sep 2020 15:41:19 +0200 Subject: [PATCH 34/35] updated complexsom for sizes inclusion --- example/ComplexSOM.yaml | 6 ++++-- tomo_challenge/classifiers/ComplexSOM.py | 7 +++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/example/ComplexSOM.yaml b/example/ComplexSOM.yaml index e6aea060..0171d217 100644 --- a/example/ComplexSOM.yaml +++ b/example/ComplexSOM.yaml @@ -11,12 +11,14 @@ run: ComplexSOM: run_5: # These settings are sent to the classifier - testing: True + testing: False bins: 5 som_dim: [51,51] num_groups: 100 num_threads: 32 group_type: 'redshift' + whiten: False + redshift_propset: 2 data_threshold: [0,30] sparse_frac: 1.0 plots: True @@ -29,7 +31,7 @@ run: # as well as the magnitudes colors: False errors: False - sizes: False + sizes: True run_10: # These 
settings are sent to the classifier testing: False diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py index 2ee47bc6..1d19fd0c 100644 --- a/tomo_challenge/classifiers/ComplexSOM.py +++ b/tomo_challenge/classifiers/ComplexSOM.py @@ -148,6 +148,8 @@ def train (self, training_data, training_z): elif redshift_propset==3: property_labels = StrVector(("quan_05","quan_16","quan_50","quan_84","quan_95")) property_expressions = ("quantile(data$redshift_true,probs=c(5,16,50,84,95)/100)") + property_labels = StrVector(("quan_05","quan_16","quan_84","quan_95")) + property_expressions = ("quantile(data$redshift_true,probs=c(5,16,84,95)/100)") else: property_labels = StrVector(("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true")) property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", @@ -214,7 +216,7 @@ def train (self, training_data, training_z): #Define the output SOM name if self.opt['sizes'] == True: - som_outname = f"SOM_{som_dim}_{self.bands}_withsize.pkl" + som_outname = f"SOM_{som_dim}_{self.bands}_withsizelin.pkl" else: som_outname = f"SOM_{som_dim}_{self.bands}.pkl" @@ -257,9 +259,6 @@ def train (self, training_data, training_z): n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) print(base.length(som.rx2['unit.classif'])) #Output the SOM - #base.save(som,file=som_outname) - #with open(f"test_{som_outname}", 'wb') as f: - # pickle.dump(som, f) with open(som_outname, 'wb') as f: pickle.dump(som, f) else: From 6b41b6a997a3a3e10f1ec9631fe65382fd9b1f2b Mon Sep 17 00:00:00 2001 From: Angus Wright Date: Mon, 14 Sep 2020 23:39:54 +0200 Subject: [PATCH 35/35] updated yaml --- example/ComplexSOM.yaml => ComplexSOM.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) rename example/ComplexSOM.yaml => ComplexSOM.yaml (95%) diff --git a/example/ComplexSOM.yaml b/ComplexSOM.yaml similarity index 95% rename from example/ComplexSOM.yaml rename to ComplexSOM.yaml index 0171d217..5b76fa08 100644 --- a/example/ComplexSOM.yaml +++ b/ComplexSOM.yaml @@ -14,7 +14,7 @@ run: testing: False bins: 5 som_dim: [51,51] - num_groups: 100 + num_groups: 200 num_threads: 32 group_type: 'redshift' whiten: False @@ -40,6 +40,7 @@ run: num_groups: 200 num_threads: 32 group_type: 'redshift' + redshift_propset: 2 data_threshold: [0,30] sparse_frac: 1.0 plots: True @@ -52,5 +53,5 @@ run: # as well as the magnitudes colors: False errors: False - sizes: False + sizes: True
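For reference, a minimal usage sketch of the KL helper added in PATCH 31.
The toy N(z)s and galaxy counts below are illustrative only (not
survey-realistic); `get_cl_matrix` must be called before `get_kltrans`.

import numpy as np
import matplotlib.pyplot as plt
from snc import SnCalc

# Three toy Gaussian samples; the amplitudes set the total galaxy counts
z_arr = np.linspace(0, 2, 256)
nzs = [1E7 * np.exp(-0.5 * ((z_arr - zc) / 0.1)**2)
       for zc in (0.5, 0.9, 1.3)]

calc = SnCalc(z_arr, nzs, fsky=0.4)
calc.get_cl_matrix()

# Per-mode and cumulative Fisher information of the KL modes
fish_kl, fish_permode, fish_cumul = calc.get_kltrans()
plt.plot(np.arange(len(fish_cumul)) + 1, fish_cumul / fish_cumul[-1], 'k.-')
plt.xlabel('Number of KL modes')
plt.ylabel('Cumulative Fisher fraction')
plt.show()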