Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit a3dc618

Browse files
committed
[SPARK-5477] refactor stat.py
There is only a single `stat.py` file for the `mllib.stat` package. We recently added `MultivariateGaussian` under `mllib.stat.distribution` in Scala/Java. It would be nice to refactor `stat.py` into a package so that it is easy to expand.

Note that `ChiSqTestResult` is moved from `mllib.stat` to `mllib.stat.test`, which is the location used in Scala/Java. It is only used as the return value of `Statistics.chiSqTest`, so this should be an okay change.

davies

Author: Xiangrui Meng <[email protected]>

Closes apache#4266 from mengxr/py-stat-refactor and squashes the following commits:

1a5e1db [Xiangrui Meng] refactor stat.py
1 parent 5ad78f6 commit a3dc618

File tree

5 files changed: 97 lines added (+97), 54 lines removed (-54).

mllib/pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@
125125
<directory>../python</directory>
126126
<includes>
127127
<include>pyspark/mllib/*.py</include>
128+
<include>pyspark/mllib/stat/*.py</include>
128129
<include>pyspark/ml/*.py</include>
129130
<include>pyspark/ml/param/*.py</include>
130131
</includes>
python/pyspark/mllib/stat/__init__.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
"""
19+
Python package for statistical functions in MLlib.
20+
"""
21+
22+
from pyspark.mllib.stat._statistics import *
23+
24+
__all__ = ["Statistics", "MultivariateStatisticalSummary"]

python/pyspark/mllib/stat.py renamed to python/pyspark/mllib/stat/_statistics.py

Lines changed: 2 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,14 @@
1515
# limitations under the License.
1616
#
1717

18-
"""
19-
Python package for statistical functions in MLlib.
20-
"""
21-
2218
from pyspark import RDD
2319
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
2420
from pyspark.mllib.linalg import Matrix, _convert_to_vector
2521
from pyspark.mllib.regression import LabeledPoint
22+
from pyspark.mllib.stat.test import ChiSqTestResult
2623

2724

28-
__all__ = ['MultivariateStatisticalSummary', 'ChiSqTestResult', 'Statistics']
25+
__all__ = ['MultivariateStatisticalSummary', 'Statistics']
2926

3027

3128
class MultivariateStatisticalSummary(JavaModelWrapper):
@@ -53,54 +50,6 @@ def min(self):
5350
return self.call("min").toArray()
5451

5552

56-
class ChiSqTestResult(JavaModelWrapper):
57-
"""
58-
.. note:: Experimental
59-
60-
Object containing the test results for the chi-squared hypothesis test.
61-
"""
62-
@property
63-
def method(self):
64-
"""
65-
Name of the test method
66-
"""
67-
return self._java_model.method()
68-
69-
@property
70-
def pValue(self):
71-
"""
72-
The probability of obtaining a test statistic result at least as
73-
extreme as the one that was actually observed, assuming that the
74-
null hypothesis is true.
75-
"""
76-
return self._java_model.pValue()
77-
78-
@property
79-
def degreesOfFreedom(self):
80-
"""
81-
Returns the degree(s) of freedom of the hypothesis test.
82-
Return type should be Number(e.g. Int, Double) or tuples of Numbers.
83-
"""
84-
return self._java_model.degreesOfFreedom()
85-
86-
@property
87-
def statistic(self):
88-
"""
89-
Test statistic.
90-
"""
91-
return self._java_model.statistic()
92-
93-
@property
94-
def nullHypothesis(self):
95-
"""
96-
Null hypothesis of the test.
97-
"""
98-
return self._java_model.nullHypothesis()
99-
100-
def __str__(self):
101-
return self._java_model.toString()
102-
103-
10453
class Statistics(object):
10554

10655
@staticmethod

python/pyspark/mllib/stat/test.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
from pyspark.mllib.common import JavaModelWrapper
19+
20+
21+
__all__ = ["ChiSqTestResult"]
22+
23+
24+
class ChiSqTestResult(JavaModelWrapper):
25+
"""
26+
.. note:: Experimental
27+
28+
Object containing the test results for the chi-squared hypothesis test.
29+
"""
30+
@property
31+
def method(self):
32+
"""
33+
Name of the test method
34+
"""
35+
return self._java_model.method()
36+
37+
@property
38+
def pValue(self):
39+
"""
40+
The probability of obtaining a test statistic result at least as
41+
extreme as the one that was actually observed, assuming that the
42+
null hypothesis is true.
43+
"""
44+
return self._java_model.pValue()
45+
46+
@property
47+
def degreesOfFreedom(self):
48+
"""
49+
Returns the degree(s) of freedom of the hypothesis test.
50+
Return type should be Number(e.g. Int, Double) or tuples of Numbers.
51+
"""
52+
return self._java_model.degreesOfFreedom()
53+
54+
@property
55+
def statistic(self):
56+
"""
57+
Test statistic.
58+
"""
59+
return self._java_model.statistic()
60+
61+
@property
62+
def nullHypothesis(self):
63+
"""
64+
Null hypothesis of the test.
65+
"""
66+
return self._java_model.nullHypothesis()
67+
68+
def __str__(self):
69+
return self._java_model.toString()

python/run-tests

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ function run_mllib_tests() {
7676
run_test "pyspark/mllib/rand.py"
7777
run_test "pyspark/mllib/recommendation.py"
7878
run_test "pyspark/mllib/regression.py"
79-
run_test "pyspark/mllib/stat.py"
79+
run_test "pyspark/mllib/stat/_statistics.py"
8080
run_test "pyspark/mllib/tree.py"
8181
run_test "pyspark/mllib/util.py"
8282
run_test "pyspark/mllib/tests.py"

0 commit comments

Comments
 (0)