
Commit 5e9541a

shivaram authored and Felix Cheung committed
[SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows
## What changes were proposed in this pull request?

This change skips tests that use the Hadoop libraries while the CRAN check is running with Windows as the operating system. This handles cases where the Hadoop winutils binaries are missing on the target system. The skipped tests consist of:

1. Tests that save and load a model in MLlib
2. Tests that save and load CSV, JSON and Parquet files in SQL
3. Hive tests

## How was this patch tested?

Tested by running on a local Windows VM with HADOOP_HOME unset. Also tested with https://win-builder.r-project.org

Author: Shivaram Venkataraman <[email protected]>

Closes #17966 from shivaram/sparkr-windows-cran.

(cherry picked from commit d06610f)
Signed-off-by: Felix Cheung <[email protected]>
1 parent ddc199e commit 5e9541a
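The same guard pattern is applied in every test file touched by this commit: each model save/load block is wrapped in a check on a single predicate, so the rest of the test still runs under a CRAN check on Windows. A condensed sketch of the pattern, with helper and variable names taken from the diffs that follow (not a verbatim excerpt):

    # Condensed sketch: the save/load portion of a test only runs when we are
    # not under a CRAN check, or when we are not on Windows, or when
    # HADOOP_HOME points at a usable Hadoop/winutils installation.
    test_that("spark.svmLinear", {
      # ... fitting and prediction assertions run unconditionally ...
      if (not_cran_or_windows_with_hadoop()) {
        modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
        write.ml(model, modelPath)
        model2 <- read.ml(modelPath)
        expect_equal(summary(model)$coefficients, summary(model2)$coefficients)
        unlink(modelPath)
      }
    })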

File tree: 8 files changed, +445 -381 lines changed


R/pkg/R/utils.R

Lines changed: 16 additions & 0 deletions
@@ -899,3 +899,19 @@ basenameSansExtFromUrl <- function(url) {
 isAtomicLengthOne <- function(x) {
   is.atomic(x) && length(x) == 1
 }
+
+is_cran <- function() {
+  !identical(Sys.getenv("NOT_CRAN"), "true")
+}
+
+is_windows <- function() {
+  .Platform$OS.type == "windows"
+}
+
+hadoop_home_set <- function() {
+  !identical(Sys.getenv("HADOOP_HOME"), "")
+}
+
+not_cran_or_windows_with_hadoop <- function() {
+  !is_cran() && (!is_windows() || hadoop_home_set())
+}
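Taken together, these helpers reduce the skip decision to one predicate. A small illustration of how it evaluates, using hypothetical environment values that are not part of the patch (the function names are the ones added above):

    # Hypothetical illustration: under a CRAN check NOT_CRAN is not "true",
    # so is_cran() is TRUE and the combined predicate is FALSE, which skips
    # every guarded save/load block in the test files below.
    Sys.unsetenv("NOT_CRAN")
    Sys.setenv(HADOOP_HOME = "")
    is_cran()                          # TRUE  -> running under a CRAN check
    hadoop_home_set()                  # FALSE -> no Hadoop/winutils configured
    not_cran_or_windows_with_hadoop()  # FALSE -> guarded blocks are skipped

    # Outside a CRAN check (e.g. a developer machine that sets NOT_CRAN) the
    # blocks run everywhere except on Windows machines without HADOOP_HOME.
    Sys.setenv(NOT_CRAN = "true", HADOOP_HOME = "C:/hadoop")  # example values
    not_cran_or_windows_with_hadoop()  # TRUE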

R/pkg/inst/tests/testthat/test_mllib_classification.R

Lines changed: 49 additions & 41 deletions
@@ -50,15 +50,17 @@ test_that("spark.svmLinear", {
   expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)
 
   # Test model save and load
-  modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  coefs <- summary(model)$coefficients
-  coefs2 <- summary(model2)$coefficients
-  expect_equal(coefs, coefs2)
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
+    write.ml(model, modelPath)
+    expect_error(write.ml(model, modelPath))
+    write.ml(model, modelPath, overwrite = TRUE)
+    model2 <- read.ml(modelPath)
+    coefs <- summary(model)$coefficients
+    coefs2 <- summary(model2)$coefficients
+    expect_equal(coefs, coefs2)
+    unlink(modelPath)
+  }
 
   # Test prediction with numeric label
   label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
@@ -128,15 +130,17 @@ test_that("spark.logit", {
   expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))
 
   # Test model save and load
-  modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  coefs <- summary(model)$coefficients
-  coefs2 <- summary(model2)$coefficients
-  expect_equal(coefs, coefs2)
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
+    write.ml(model, modelPath)
+    expect_error(write.ml(model, modelPath))
+    write.ml(model, modelPath, overwrite = TRUE)
+    model2 <- read.ml(modelPath)
+    coefs <- summary(model)$coefficients
+    coefs2 <- summary(model2)$coefficients
+    expect_equal(coefs, coefs2)
+    unlink(modelPath)
+  }
 
   # R code to reproduce the result.
   # nolint start
@@ -243,19 +247,21 @@ test_that("spark.mlp", {
   expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
 
   # Test model save/load
-  modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  summary2 <- summary(model2)
-
-  expect_equal(summary2$numOfInputs, 4)
-  expect_equal(summary2$numOfOutputs, 3)
-  expect_equal(summary2$layers, c(4, 5, 4, 3))
-  expect_equal(length(summary2$weights), 64)
-
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
+    write.ml(model, modelPath)
+    expect_error(write.ml(model, modelPath))
+    write.ml(model, modelPath, overwrite = TRUE)
+    model2 <- read.ml(modelPath)
+    summary2 <- summary(model2)
+
+    expect_equal(summary2$numOfInputs, 4)
+    expect_equal(summary2$numOfOutputs, 3)
+    expect_equal(summary2$layers, c(4, 5, 4, 3))
+    expect_equal(length(summary2$weights), 64)
+
+    unlink(modelPath)
+  }
 
   # Test default parameter
   model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3))
@@ -354,16 +360,18 @@ test_that("spark.naiveBayes", {
                                "Yes", "Yes", "No", "No"))
 
   # Test model save/load
-  modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
-  write.ml(m, modelPath)
-  expect_error(write.ml(m, modelPath))
-  write.ml(m, modelPath, overwrite = TRUE)
-  m2 <- read.ml(modelPath)
-  s2 <- summary(m2)
-  expect_equal(s$apriori, s2$apriori)
-  expect_equal(s$tables, s2$tables)
-
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
+    write.ml(m, modelPath)
+    expect_error(write.ml(m, modelPath))
+    write.ml(m, modelPath, overwrite = TRUE)
+    m2 <- read.ml(modelPath)
+    s2 <- summary(m2)
+    expect_equal(s$apriori, s2$apriori)
+    expect_equal(s$tables, s2$tables)
+
+    unlink(modelPath)
+  }
 
   # Test e1071::naiveBayes
   if (requireNamespace("e1071", quietly = TRUE)) {

R/pkg/inst/tests/testthat/test_mllib_clustering.R

Lines changed: 60 additions & 52 deletions
@@ -53,18 +53,20 @@ test_that("spark.bisectingKmeans", {
                c(0, 1, 2, 3))
 
   # Test model save/load
-  modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  summary2 <- summary(model2)
-  expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
-  expect_equal(summary.model$coefficients, summary2$coefficients)
-  expect_true(!summary.model$is.loaded)
-  expect_true(summary2$is.loaded)
-
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
+    write.ml(model, modelPath)
+    expect_error(write.ml(model, modelPath))
+    write.ml(model, modelPath, overwrite = TRUE)
+    model2 <- read.ml(modelPath)
+    summary2 <- summary(model2)
+    expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
+    expect_equal(summary.model$coefficients, summary2$coefficients)
+    expect_true(!summary.model$is.loaded)
+    expect_true(summary2$is.loaded)
+
+    unlink(modelPath)
+  }
 })
 
 test_that("spark.gaussianMixture", {
@@ -125,18 +127,20 @@ test_that("spark.gaussianMixture", {
   expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))
 
   # Test model save/load
-  modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  stats2 <- summary(model2)
-  expect_equal(stats$lambda, stats2$lambda)
-  expect_equal(unlist(stats$mu), unlist(stats2$mu))
-  expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
-  expect_equal(unlist(stats$loglik), unlist(stats2$loglik))
-
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
+    write.ml(model, modelPath)
+    expect_error(write.ml(model, modelPath))
+    write.ml(model, modelPath, overwrite = TRUE)
+    model2 <- read.ml(modelPath)
+    stats2 <- summary(model2)
+    expect_equal(stats$lambda, stats2$lambda)
+    expect_equal(unlist(stats$mu), unlist(stats2$mu))
+    expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
+    expect_equal(unlist(stats$loglik), unlist(stats2$loglik))
+
+    unlink(modelPath)
+  }
 })
 
 test_that("spark.kmeans", {
@@ -171,18 +175,20 @@ test_that("spark.kmeans", {
   expect_true(class(summary.model$coefficients[1, ]) == "numeric")
 
   # Test model save/load
-  modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  summary2 <- summary(model2)
-  expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
-  expect_equal(summary.model$coefficients, summary2$coefficients)
-  expect_true(!summary.model$is.loaded)
-  expect_true(summary2$is.loaded)
-
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
+    write.ml(model, modelPath)
+    expect_error(write.ml(model, modelPath))
+    write.ml(model, modelPath, overwrite = TRUE)
+    model2 <- read.ml(modelPath)
+    summary2 <- summary(model2)
+    expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
+    expect_equal(summary.model$coefficients, summary2$coefficients)
+    expect_true(!summary.model$is.loaded)
+    expect_true(summary2$is.loaded)
+
+    unlink(modelPath)
+  }
 
   # Test Kmeans on dataset that is sensitive to seed value
   col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
@@ -236,22 +242,24 @@ test_that("spark.lda with libsvm", {
   expect_true(logPrior <= 0 & !is.na(logPrior))
 
   # Test model save/load
-  modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  stats2 <- summary(model2)
-
-  expect_true(stats2$isDistributed)
-  expect_equal(logLikelihood, stats2$logLikelihood)
-  expect_equal(logPerplexity, stats2$logPerplexity)
-  expect_equal(vocabSize, stats2$vocabSize)
-  expect_equal(vocabulary, stats2$vocabulary)
-  expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
-  expect_equal(logPrior, stats2$logPrior)
-
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
+    write.ml(model, modelPath)
+    expect_error(write.ml(model, modelPath))
+    write.ml(model, modelPath, overwrite = TRUE)
+    model2 <- read.ml(modelPath)
+    stats2 <- summary(model2)
+
+    expect_true(stats2$isDistributed)
+    expect_equal(logLikelihood, stats2$logLikelihood)
+    expect_equal(logPerplexity, stats2$logPerplexity)
+    expect_equal(vocabSize, stats2$vocabSize)
+    expect_equal(vocabulary, stats2$vocabulary)
+    expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
+    expect_equal(logPrior, stats2$logPrior)
+
+    unlink(modelPath)
+  }
 })
 
 test_that("spark.lda with text input", {

R/pkg/inst/tests/testthat/test_mllib_fpm.R

Lines changed: 9 additions & 7 deletions
@@ -62,15 +62,17 @@ test_that("spark.fpGrowth", {
 
   expect_equivalent(expected_predictions, collect(predict(model, new_data)))
 
-  modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
-  write.ml(model, modelPath, overwrite = TRUE)
-  loaded_model <- read.ml(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
+    write.ml(model, modelPath, overwrite = TRUE)
+    loaded_model <- read.ml(modelPath)
 
-  expect_equivalent(
-    itemsets,
-    collect(spark.freqItemsets(loaded_model)))
+    expect_equivalent(
+      itemsets,
+      collect(spark.freqItemsets(loaded_model)))
 
-  unlink(modelPath)
+    unlink(modelPath)
+  }
 
   model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8)
   expect_equal(

R/pkg/inst/tests/testthat/test_mllib_recommendation.R

Lines changed: 22 additions & 20 deletions
@@ -37,29 +37,31 @@ test_that("spark.als", {
                tolerance = 1e-4)
 
   # Test model save/load
-  modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  stats2 <- summary(model2)
-  expect_equal(stats2$rating, "score")
-  userFactors <- collect(stats$userFactors)
-  itemFactors <- collect(stats$itemFactors)
-  userFactors2 <- collect(stats2$userFactors)
-  itemFactors2 <- collect(stats2$itemFactors)
+  if (not_cran_or_windows_with_hadoop()) {
+    modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
+    write.ml(model, modelPath)
+    expect_error(write.ml(model, modelPath))
+    write.ml(model, modelPath, overwrite = TRUE)
+    model2 <- read.ml(modelPath)
+    stats2 <- summary(model2)
+    expect_equal(stats2$rating, "score")
+    userFactors <- collect(stats$userFactors)
+    itemFactors <- collect(stats$itemFactors)
+    userFactors2 <- collect(stats2$userFactors)
+    itemFactors2 <- collect(stats2$itemFactors)
 
-  orderUser <- order(userFactors$id)
-  orderUser2 <- order(userFactors2$id)
-  expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2])
-  expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2])
+    orderUser <- order(userFactors$id)
+    orderUser2 <- order(userFactors2$id)
+    expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2])
+    expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2])
 
-  orderItem <- order(itemFactors$id)
-  orderItem2 <- order(itemFactors2$id)
-  expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2])
-  expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2])
+    orderItem <- order(itemFactors$id)
+    orderItem2 <- order(itemFactors2$id)
+    expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2])
+    expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2])
 
-  unlink(modelPath)
+    unlink(modelPath)
+  }
 })
 
 sparkR.session.stop()
