1- """
1+ """Smoother utility.
2+
23This file contains the smoothing utility functions. We have a number of
34possible smoothers to choose from: windowed average, local weighted regression,
45and a causal Savitzky-Golay filter.
1011docstrings for details.
1112"""
1213
14+ from typing import Union
1315import warnings
1416
1517import numpy as np
1618import pandas as pd
1719
1820
1921class Smoother :
20- """
22+ """Smoother class.
23+
2124 This is the smoothing utility class. This class holds the parameter settings for its smoother
2225 methods and provides reasonable defaults. Basic usage can be found in the examples below.
2326
@@ -36,9 +39,13 @@ class Smoother:
3639 Descriptions of the smoothers are available in the doc strings. Full mathematical
3740 details are in: https://github.com/cmu-delphi/covidcast-modeling/ in the folder
3841 'indicator_smoother'.
42+ poly_fit_degree: int
43+ A parameter for the 'savgol' smoother which sets the degree of the polynomial fit.
3944 window_length: int
40- The length of the averaging window for 'savgol' and 'moving_average'.
41- This value is in the units of the data, which tends to be days.
45+ The length of the fitting window for 'savgol' and the averaging window 'moving_average'.
46+ This value is in the units provided by the data, which are likely to be days for Delphi.
47+ Note that if window_length is smaller than the length of the signal, then only the
48+ imputation method is run on the signal.
4249 gaussian_bandwidth: float or None
4350 If float, all regression is done with Gaussian weights whose variance is
4451 half the gaussian_bandwidth. If None, performs unweighted regression. (Applies
@@ -60,14 +67,19 @@ class Smoother:
6067 The smallest value to allow in a signal. If None, there is no smallest value.
6168 Currently only implemented for 'left_gauss_linear'. This should probably not be in the scope
6269 of the smoothing utility.
63- poly_fit_degree: int
64- A parameter for the 'savgol' smoother which sets the degree of the polynomial fit.
70+ boundary_method: {'shortened_window', 'identity', 'nan'}
71+ Determines how the 'savgol' method handles smoothing at the (left) boundary, where the past
72+ data length is shorter than the window_length parameter. If 'shortened_window', it uses the
73+ maximum window available; at the very edge (generally up to poly_fit_degree) it keeps the
74+ same value as the raw signal. If 'identity', it just keeps the raw signal. If 'nan', it
75+ writes nans. For the other smoothing methods, 'moving_average' writes nans and
76+ 'left_gauss_linear' uses a shortened window.
6577
6678 Methods
6779 ----------
6880 smooth: np.ndarray or pd.Series
69- Takes a 1D signal and returns a smoothed version. The input and the output have the same length
70- and type.
81+ Takes a 1D signal and returns a smoothed version. The input and the output have the same
82+ length and type.
7183
7284 Example Usage
7385 -------------
@@ -108,6 +120,7 @@ def __init__(
108120 minval = None ,
109121 boundary_method = "shortened_window" ,
110122 ):
123+ """See class docstring."""
111124 self .smoother_name = smoother_name
112125 self .poly_fit_degree = poly_fit_degree
113126 self .window_length = window_length
@@ -118,35 +131,38 @@ def __init__(
118131
119132 valid_smoothers = {"savgol" , "left_gauss_linear" , "moving_average" , "identity" }
120133 valid_impute_methods = {"savgol" , "zeros" , None }
134+ valid_boundary_methods = {"shortened_window" , "identity" , "nan" }
121135 if self .smoother_name not in valid_smoothers :
122- raise ValueError ("Invalid smoother name given." )
136+ raise ValueError ("Invalid smoother_name given." )
123137 if self .impute_method not in valid_impute_methods :
124- raise ValueError ("Invalid impute method given." )
138+ raise ValueError ("Invalid impute_method given." )
139+ if self .boundary_method not in valid_boundary_methods :
140+ raise ValueError ("Invalid boundary_method given." )
125141
126142 if smoother_name == "savgol" :
143+ # The polynomial fitting is done on a past window of size window_length
144+ # including the current day value.
127145 self .coeffs = self .savgol_coeffs (- self .window_length + 1 , 0 )
128146 else :
129147 self .coeffs = None
130148
131- def smooth (self , signal ):
132- """
133- The major workhorse smoothing function. Can use one of three smoothing
134- methods, as specified by the class variable smoother_name.
149+ def smooth (self , signal : Union [np .ndarray , pd .Series ]) -> Union [np .ndarray , pd .Series ]:
150+ """Apply a smoother to a signal.
151+
152+ The major workhorse smoothing function. Imputes the nans and then applies
153+ a smoother to the signal.
135154
136155 Parameters
137156 ----------
138157 signal: np.ndarray or pd.Series
139158 A 1D signal to be smoothed.
140159
160+ Returns
161+ ----------
141162 signal_smoothed: np.ndarray or pd.Series
142163 A smoothed 1D signal. Returns an array of the same type and length as
143164 the input.
144165 """
145- if len (signal ) < self .window_length :
146- raise ValueError (
147- "The window_length must be smaller than the length of the signal."
148- )
149-
150166 is_pandas_series = isinstance (signal , pd .Series )
151167 signal = signal .to_numpy () if is_pandas_series else signal
152168
@@ -158,14 +174,16 @@ def smooth(self, signal):
158174 signal_smoothed = self .left_gauss_linear_smoother (signal )
159175 elif self .smoother_name == "moving_average" :
160176 signal_smoothed = self .moving_average_smoother (signal )
161- elif self . smoother_name == "identity" :
162- signal_smoothed = signal
177+ else :
178+ signal_smoothed = signal . copy ()
163179
164- return signal_smoothed if not is_pandas_series else pd .Series (signal_smoothed )
180+ signal_smoothed = signal_smoothed if not is_pandas_series else pd .Series (signal_smoothed )
181+ return signal_smoothed
165182
166183 def impute (self , signal ):
167- """
168- Imputes the nan values in the signal.
184+ """Impute the nan values in the signal.
185+
186+ See the class docstring for an explanation of the impute methods.
169187
170188 Parameters
171189 ----------
@@ -191,8 +209,7 @@ def impute(self, signal):
191209 return imputed_signal
192210
193211 def moving_average_smoother (self , signal ):
194- """
195- Computes a moving average on the signal.
212+ """Compute a moving average on the signal.
196213
197214 Parameters
198215 ----------
@@ -219,11 +236,10 @@ def moving_average_smoother(self, signal):
219236 return signal_smoothed
220237
221238 def left_gauss_linear_smoother (self , signal ):
222- """
223- DEPRECATED: This method is available to help sanity check the 'savgol' method.
224- It is a little slow, so use 'savgol' with poly_fit_degree=1 instead.
239+ """Smooth the y-values using a local linear regression with Gaussian weights.
225240
226- Smooth the y-values using a local linear regression with Gaussian weights.
241+ DEPRECATED: This method is available to help sanity check the 'savgol' method.
242+ Use 'savgol' with poly_fit_degree=1 and the appropriate gaussian_bandwidth instead.
227243
228244 At each time t, we use the data from times 1, ..., t-dt, weighted
229245 using the Gaussian kernel, to produce the estimate at time t.
@@ -263,7 +279,8 @@ def left_gauss_linear_smoother(self, signal):
263279 return signal_smoothed
264280
265281 def savgol_predict (self , signal ):
266- """
282+ """Predict a single value using the savgol method.
283+
267284 Fits a polynomial through the values given by the signal and returns the value
268285 of the polynomial at the right-most signal-value. More precisely, fits a polynomial
269286 f(t) of degree poly_fit_degree through the points signal[-n], signal[-n+1] ..., signal[-1],
@@ -279,14 +296,15 @@ def savgol_predict(self, signal):
279296 predicted_value: float
280297 The anticipated value that comes after the end of the signal based on a polynomial fit.
281298 """
299+ # Add one
282300 coeffs = self .savgol_coeffs (- len (signal ) + 1 , 0 )
283301 predicted_value = signal @ coeffs
284302 return predicted_value
285303
286304 def savgol_coeffs (self , nl , nr ):
287- """
288- Solves for the Savitzky-Golay coefficients. The coefficients c_i
289- give a filter so that
305+ """Solve for the Savitzky-Golay coefficients.
306+
307+ The coefficients c_i give a filter so that
290308 y = sum_{i=-{n_l}}^{n_r} c_i x_i
291309 is the value at 0 (thus the constant term) of the polynomial fit
292310 through the points {x_i}. The coefficients are c_i are calculated as
@@ -298,9 +316,9 @@ def savgol_coeffs(self, nl, nr):
298316 Parameters
299317 ----------
300318 nl: int
301- The left window bound for the polynomial fit.
319+ The left window bound for the polynomial fit, inclusive .
302320 nr: int
303- The right window bound for the polynomial fit.
321+ The right window bound for the polynomial fit, inclusive .
304322 poly_fit_degree: int
305323 The degree of the polynomial to be fit.
306324 gaussian_bandwidth: float or None
@@ -336,14 +354,10 @@ def savgol_coeffs(self, nl, nr):
336354 return coeffs
337355
338356 def savgol_smoother (self , signal ):
339- """
340- Returns a specific type of convolution of the 1D signal with the 1D signal
341- coeffs, respecting boundary effects. That is, the output y is
342- signal_smoothed_i = signal_i
343- signal_smoothed_i = sum_{j=0}^n coeffs_j signal_{i+j}, if i >= len(coeffs) - 1
344- In words, entries close to the left boundary are not smoothed, the window does
345- not proceed over the right boundary, and the rest of the values are regular
346- convolution.
357+ """Smooth signal with the savgol smoother.
358+
359+ Returns a convolution of the 1D signal with the Savitzky-Golay coefficients, respecting
360+ boundary effects. For an explanation of boundary effects methods, see the class docstring.
347361
348362 Parameters
349363 ----------
@@ -355,15 +369,14 @@ def savgol_smoother(self, signal):
355369 signal_smoothed: np.ndarray
356370 A smoothed 1D signal of same length as signal.
357371 """
358-
359- # reverse because np.convolve reverses the second argument
372+ # Reverse because np.convolve reverses the second argument
360373 temp_reversed_coeffs = np .array (list (reversed (self .coeffs )))
361374
362- # does the majority of the smoothing, with the calculated coefficients
375+ # Smooth the part of the signal away from the boundary first
363376 signal_padded = np .append (np .nan * np .ones (len (self .coeffs ) - 1 ), signal )
364377 signal_smoothed = np .convolve (signal_padded , temp_reversed_coeffs , mode = "valid" )
365378
366- # this section handles the smoothing behavior at the (left) boundary:
379+ # This section handles the smoothing behavior at the (left) boundary:
367380 # - shortened_window (default) applies savgol with a smaller window to do the fit
368381 # - identity keeps the original signal (doesn't smooth)
369382 # - nan writes nans
@@ -372,22 +385,23 @@ def savgol_smoother(self, signal):
372385 if ix == 0 :
373386 signal_smoothed [ix ] = signal [ix ]
374387 else :
388+ # At the very edge, the design matrix is often singular, in which case
389+ # we just fall back to the raw signal
375390 try :
376- signal_smoothed [ix ] = self .savgol_predict (signal [: ( ix + 1 ) ])
377- except np .linalg .LinAlgError : # for small ix, the design matrix is singular
391+ signal_smoothed [ix ] = self .savgol_predict (signal [:ix + 1 ])
392+ except np .linalg .LinAlgError :
378393 signal_smoothed [ix ] = signal [ix ]
379394 return signal_smoothed
380395 elif self .boundary_method == "identity" :
381- for ix in range (len (self .coeffs )):
396+ for ix in range (min ( len (self .coeffs ), len ( signal ) )):
382397 signal_smoothed [ix ] = signal [ix ]
383398 return signal_smoothed
384399 elif self .boundary_method == "nan" :
385400 return signal_smoothed
386- else :
387- raise ValueError ("Unknown boundary method." )
388401
389402 def savgol_impute (self , signal ):
390- """
403+ """Impute the nan values in signal using savgol.
404+
391405 This method fills the nan values in the signal with an M-degree polynomial fit
392406 on a rolling window of the immediate past up to window_length data points.
393407
0 commit comments