@@ -29,28 +29,28 @@
         sc.parallelize([(0L, "a b c d e spark", 1.0),
                         (1L, "b d", 0.0),
                         (2L, "spark f g h", 1.0),
-                        (3L, "hadoop mapreduce", 0.0)]) \
+                        (3L, "hadoop mapreduce", 0.0)])
           .map(lambda x: Row(id=x[0], text=x[1], label=x[2])))
 
     tokenizer = Tokenizer() \
-      .setInputCol("text") \
-      .setOutputCol("words")
+        .setInputCol("text") \
+        .setOutputCol("words")
     hashingTF = HashingTF() \
-      .setInputCol(tokenizer.getOutputCol()) \
-      .setOutputCol("features")
+        .setInputCol(tokenizer.getOutputCol()) \
+        .setOutputCol("features")
     lr = LogisticRegression() \
-      .setMaxIter(10) \
-      .setRegParam(0.01)
+        .setMaxIter(10) \
+        .setRegParam(0.01)
     pipeline = Pipeline() \
-      .setStages([tokenizer, hashingTF, lr])
+        .setStages([tokenizer, hashingTF, lr])
 
     model = pipeline.fit(training)
 
     test = sqlCtx.inferSchema(
         sc.parallelize([(4L, "spark i j k"),
                         (5L, "l m n"),
                         (6L, "mapreduce spark"),
-                        (7L, "apache hadoop")]) \
+                        (7L, "apache hadoop")])
          .map(lambda x: Row(id=x[0], text=x[1])))
 
     for row in model.transform(test).collect():
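
The hunk drops the trailing `\` after the two `parallelize([...])` calls and re-indents the chained setter calls from six to eight spaces. The backslashes are redundant because both expressions sit inside the open parenthesis of `sqlCtx.inferSchema(`, so Python continues the line implicitly. For reference, below is a sketch of the full example as it reads once the change is applied. The imports, the `SparkContext`/`SQLContext` setup, the `training = ...` assignment, and the body of the final loop are not part of the hunk; they are assumed here from the Spark 1.2-era Python API (Python 2, hence the `0L` long literals and the `print` statement).

```python
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row, SQLContext

sc = SparkContext(appName="SimpleTextClassificationPipeline")  # assumed setup
sqlCtx = SQLContext(sc)

# Labeled training documents: (id, text, label).
training = sqlCtx.inferSchema(
    sc.parallelize([(0L, "a b c d e spark", 1.0),
                    (1L, "b d", 0.0),
                    (2L, "spark f g h", 1.0),
                    (3L, "hadoop mapreduce", 0.0)])
      .map(lambda x: Row(id=x[0], text=x[1], label=x[2])))

# Three-stage pipeline: split text into words, hash the words into
# feature vectors, then fit a logistic regression model.
tokenizer = Tokenizer() \
    .setInputCol("text") \
    .setOutputCol("words")
hashingTF = HashingTF() \
    .setInputCol(tokenizer.getOutputCol()) \
    .setOutputCol("features")
lr = LogisticRegression() \
    .setMaxIter(10) \
    .setRegParam(0.01)
pipeline = Pipeline() \
    .setStages([tokenizer, hashingTF, lr])

model = pipeline.fit(training)

# Unlabeled test documents: (id, text).
test = sqlCtx.inferSchema(
    sc.parallelize([(4L, "spark i j k"),
                    (5L, "l m n"),
                    (6L, "mapreduce spark"),
                    (7L, "apache hadoop")])
      .map(lambda x: Row(id=x[0], text=x[1])))

# The loop body is not shown in the diff; printing each row with its
# prediction is an assumption.
for row in model.transform(test).collect():
    print row

sc.stop()
```

Relying on implicit line continuation inside parentheses rather than backslashes is the style PEP 8 recommends, which is presumably the motivation for the change.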
|  | 