diff --git a/machine_learning/linear_regression.py b/machine_learning/linear_regression.py
index 5b1e663116cc..d0db65a69b26 100644
--- a/machine_learning/linear_regression.py
+++ b/machine_learning/linear_regression.py
@@ -21,127 +21,66 @@ def collect_dataset():
-    """Collect dataset of CSGO
-    The dataset contains ADR vs Rating of a Player
-    :return : dataset obtained from the link, as matrix
-    """
+    """Collect dataset of CSGO (ADR vs Rating)."""
     response = httpx.get(
-        "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/"
-        "master/Week1/ADRvsRating.csv",
+        "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/master/Week1/ADRvsRating.csv",
         timeout=10,
     )
-    lines = response.text.splitlines()
-    data = []
-    for item in lines:
-        item = item.split(",")
-        data.append(item)
-    data.pop(0)  # This is for removing the labels from the list
-    dataset = np.matrix(data)
-    return dataset
-
-
-def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and updates the Feature vector accordingly_
-    :param data_x : contains the dataset
-    :param data_y : contains the output associated with each data-entry
-    :param len_data : length of the data_
-    :param alpha : Learning rate of the model
-    :param theta : Feature vector (weight's for our model)
-    ;param return : Updated Feature's, using
-                    curr_features - alpha_ * gradient(w.r.t. feature)
-    >>> import numpy as np
-    >>> data_x = np.array([[1, 2], [3, 4]])
-    >>> data_y = np.array([5, 6])
-    >>> len_data = len(data_x)
-    >>> alpha = 0.01
-    >>> theta = np.array([0.1, 0.2])
-    >>> run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-    array([0.196, 0.343])
-    """
-    n = len_data
-
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_grad = np.dot(prod, data_x)
-    theta = theta - (alpha / n) * sum_grad
+    lines = response.text.strip().splitlines()
+    data = [line.split(",") for line in lines[1:]]  # skip header
+    return np.array(data, dtype=float)
+
+
+def run_steep_gradient_descent(data_x, data_y, alpha, theta):
+    """Perform one step of gradient descent."""
+    n = data_x.shape[0]
+    predictions = data_x @ theta.T
+    errors = predictions.flatten() - data_y
+    gradient = (1 / n) * (errors @ data_x)
+    theta = theta - alpha * gradient
     return theta
 
 
-def sum_of_square_error(data_x, data_y, len_data, theta):
-    """Return sum of square error for error calculation
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :param len_data : len of the dataset
-    :param theta : contains the feature vector
-    :return : sum of square error computed from given feature's
-
-    Example:
-    >>> vc_x = np.array([[1.1], [2.1], [3.1]])
-    >>> vc_y = np.array([1.2, 2.2, 3.2])
-    >>> round(sum_of_square_error(vc_x, vc_y, 3, np.array([1])),3)
-    np.float64(0.005)
-    """
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_elem = np.sum(np.square(prod))
-    error = sum_elem / (2 * len_data)
-    return error
-
-
-def run_linear_regression(data_x, data_y):
-    """Implement Linear regression over the dataset
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :return : feature for line of best fit (Feature vector)
-    """
-    iterations = 100000
-    alpha = 0.0001550
-
-    no_features = data_x.shape[1]
-    len_data = data_x.shape[0] - 1
-
-    theta = np.zeros((1, no_features))
+def sum_of_square_error(data_x, data_y, theta):
+    """Compute half the mean squared error (least-squares cost)."""
+    n = data_x.shape[0]
+    predictions = data_x @ theta.T
+    errors = predictions.flatten() - data_y
+    return np.sum(errors**2) / (2 * n)
 
-    for i in range(iterations):
-        theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-        error = sum_of_square_error(data_x, data_y, len_data, theta)
-        print(f"At Iteration {i + 1} - Error is {error:.5f}")
 
+def run_linear_regression(data_x, data_y, iterations=100000, alpha=0.000155):
+    """Run gradient descent to learn parameters."""
+    theta = np.zeros((1, data_x.shape[1]))
+    for i in range(iterations):
+        theta = run_steep_gradient_descent(data_x, data_y, alpha, theta)
+        error = sum_of_square_error(data_x, data_y, theta)
+        print(f"Iteration {i + 1}: Error = {error:.5f}")
     return theta
 
 
 def mean_absolute_error(predicted_y, original_y):
-    """Return sum of square error for error calculation
-    :param predicted_y : contains the output of prediction (result vector)
-    :param original_y : contains values of expected outcome
-    :return : mean absolute error computed from given feature's
-
-    >>> predicted_y = [3, -0.5, 2, 7]
-    >>> original_y = [2.5, 0.0, 2, 8]
-    >>> mean_absolute_error(predicted_y, original_y)
-    0.5
-    """
-    total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y))
-    return total / len(original_y)
+    """Compute MAE (fully vectorized)."""
+    predicted_y = np.array(predicted_y)
+    original_y = np.array(original_y)
+    return np.mean(np.abs(predicted_y - original_y))
 
 
 def main():
-    """Driver function"""
     data = collect_dataset()
-
-    len_data = data.shape[0]
-    data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
-    data_y = data[:, -1].astype(float)
+    data_x = np.c_[np.ones(data.shape[0]), data[:, 0]]  # Add bias term
+    data_y = data[:, 1]  # Rating
 
     theta = run_linear_regression(data_x, data_y)
-    len_result = theta.shape[1]
-    print("Resultant Feature vector : ")
-    for i in range(len_result):
-        print(f"{theta[0, i]:.5f}")
+    print("Learned Parameters (theta):")
+    for val in theta[0]:
+        print(f"{val:.5f}")
+
+    predictions = data_x @ theta.T
+    mae = mean_absolute_error(predictions.flatten(), data_y)
+    print(f"Mean Absolute Error: {mae:.5f}")
 
 
-if __name__ == "__main__":
-    import doctest
-
-    doctest.testmod()
-    main()
+if __name__ == "__main__":
+    main()
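
A quick way to sanity-check the refactored helpers is to fit a line to a tiny synthetic dataset instead of the network-fetched CSV. This is a minimal sketch, not part of the patch; the import path `linear_regression` is an assumption about how the patched module is exposed, so adjust it to match your checkout.

# Sanity-check sketch (not part of the patch). Assumes the patched module
# is importable as `linear_regression`; the import path is hypothetical.
import numpy as np

from linear_regression import (
    mean_absolute_error,
    run_steep_gradient_descent,
    sum_of_square_error,
)

# Tiny synthetic dataset: y = 2x + 1, with a bias column prepended
# the same way main() does for the CSGO data.
x = np.array([0.0, 1.0, 2.0, 3.0])
data_x = np.c_[np.ones(x.shape[0]), x]  # shape (4, 2): [bias, feature]
data_y = 2.0 * x + 1.0                  # shape (4,)

# Repeatedly apply the single-step update; alpha=0.05 converges here.
theta = np.zeros((1, data_x.shape[1]))
for _ in range(5000):
    theta = run_steep_gradient_descent(data_x, data_y, 0.05, theta)

print(theta)                                        # ~[[1.0, 2.0]] (intercept, slope)
print(sum_of_square_error(data_x, data_y, theta))   # should approach 0
predictions = (data_x @ theta.T).flatten()
print(mean_absolute_error(predictions, data_y))     # should approach 0

Note that run_steep_gradient_descent now derives n from data_x.shape[0] instead of a caller-supplied len_data, which also removes the off-by-one (data_x.shape[0] - 1) that the old run_linear_regression passed in.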