diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a22e910004cacb..192b28c6131364 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -29,10 +29,10 @@ on: - 'main' jobs: - spark23: + spark32: if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" runs-on: macos-latest - name: Build and Test on Apache Spark 2.3.x + name: Build and Test on Apache Spark 3.2.x steps: - uses: actions/checkout@v2 @@ -48,25 +48,28 @@ jobs: - name: Install Python packages (Python 3.7) run: | python -m pip install --upgrade pip - pip install pyspark==2.3.4 numpy - - name: Build Spark NLP on Apache Spark 2.3.x + pip install pyspark==3.2.1 numpy + - name: Build Spark NLP on Apache Spark 3.2.1 run: | brew install sbt - sbt -Dis_spark23=true clean - sbt -Dis_spark23=true compile - sbt -mem 4096 -Dis_spark23=true assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 2.3.x + sbt -mem 4096 clean assemblyAndCopy + - name: Test Spark NLP in Scala - Apache Spark 3.2.x run: | - sbt -mem 4096 -Dis_spark23=true test - - name: Test Spark NLP in Python - Apache Spark 2.3.x + sbt -mem 4096 coverage test + - name: Upload coverage data to Coveralls + run: sbt coverageReport coveralls + env: + COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COVERALLS_FLAG_NAME: Apache Spark 3.2.x - Scala 2.12 + - name: Test Spark NLP in Python - Apache Spark 3.2.x run: | cd python python3.7 -m run-tests - spark24: + spark31: if: "! 
contains(toJSON(github.event.commits.*.message), '[skip test]')" runs-on: macos-latest - name: Build and Test on Apache Spark 2.4.x + name: Build and Test on Apache Spark 3.1.x steps: - uses: actions/checkout@v2 @@ -82,17 +85,15 @@ jobs: - name: Install Python packages (Python 3.7) run: | python -m pip install --upgrade pip - pip install pyspark==2.4.8 numpy - - name: Build Spark NLP on Apache Spark 2.4.x + pip install pyspark==3.1.3 numpy + - name: Build Spark NLP on Apache Spark 3.1.x run: | brew install sbt - sbt clean - sbt compile - sbt -mem 4096 -Dis_spark24=true assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 2.4.x + sbt -mem 4096 -Dis_spark30=true clean assemblyAndCopy + - name: Test Spark NLP in Scala - Apache Spark 3.1.x run: | - sbt -mem 4096 -Dis_spark24=true test - - name: Test Spark NLP in Python - Apache Spark 2.4.x + sbt -mem 4096 -Dis_spark30=true test + - name: Test Spark NLP in Python - Apache Spark 3.1.x run: | cd python python3.7 -m run-tests @@ -117,55 +118,11 @@ jobs: run: | python -m pip install --upgrade pip pip install pyspark==3.0.3 numpy - - name: Build Spark NLP on Apache Spark 3.0.3 + - name: Build Spark NLP on Apache Spark 3.0.x run: | brew install sbt - sbt clean - sbt compile - sbt -mem 4096 assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 3.0.x - run: | - sbt -mem 4096 test + sbt -mem 4096 -Dis_spark30=true clean assemblyAndCopy - name: Test Spark NLP in Python - Apache Spark 3.0.x run: | cd python - python3.7 -m run-tests - - spark32: - if: "! 
contains(toJSON(github.event.commits.*.message), '[skip test]')" - runs-on: macos-latest - name: Build and Test on Apache Spark 3.2.x - - steps: - - uses: actions/checkout@v2 - - name: Set up JDK 8 - uses: actions/setup-java@v1 - with: - java-version: 1.8 - - name: Install Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: 3.7.7 - architecture: x64 - - name: Install Python packages (Python 3.7) - run: | - python -m pip install --upgrade pip - pip install pyspark==3.2.1 numpy - - name: Build Spark NLP on Apache Spark 3.2.1 - run: | - brew install sbt - sbt clean - sbt compile - sbt -mem 4096 -Dis_spark32=true assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 3.2.x - run: | - sbt -mem 4096 -Dis_spark32=true coverage test - - name: Upload coverage data to Coveralls - run: sbt coverageReport coveralls - env: - COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - COVERALLS_FLAG_NAME: Apache Spark 3.2.x - Scala 2.12 - - name: Test Spark NLP in Python - Apache Spark 3.2.x - run: | - cd python - python3.7 -m run-tests + python3.7 -m run-tests \ No newline at end of file diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index 77396e8091a9d0..4feef2ebfeedb8 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -12,7 +12,7 @@ jobs: id: stale with: stale-issue-message: 'This issue is stale because it has been open 120 days with no activity. 
Remove stale label or comment or this will be closed in 5 days' - days-before-stale: 365 + days-before-stale: 120 days-before-close: 14 exempt-issue-labels: 'backlog,bug,ob-hold,keep' - name: Print outputs diff --git a/build.sbt b/build.sbt index f2927f75e84630..51d2cb6bed089a 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ import Dependencies._ import Resolvers.m2Resolvers import sbtassembly.MergeStrategy -name := getPackageName(is_spark23, is_spark24, is_spark32, is_gpu) +name := getPackageName(is_spark30, is_gpu) organization := "com.johnsnowlabs.nlp" @@ -217,8 +217,8 @@ inConfig(SlowTest)(Defaults.testTasks) (Test / publishArtifact) := true /** Copies the assembled jar to the pyspark/lib dir * */ -lazy val copyAssembledJar = taskKey[Unit]("Copy assembled jar to pyspark/lib") -lazy val copyAssembledJarForPyPi = taskKey[Unit]("Copy assembled jar to pyspark/sparknlp/lib") +lazy val copyAssembledJar = taskKey[Unit]("Copy assembled jar to python/lib") +lazy val copyAssembledJarForPyPi = taskKey[Unit]("Copy assembled jar to python/sparknlp/lib") copyAssembledJar := { val jarFilePath = (assembly / assemblyOutputPath).value diff --git a/project/Dependencies.scala b/project/Dependencies.scala index b3fda712ce0844..56481fc9bfabbc 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -3,56 +3,39 @@ import sbt._ object Dependencies { /** ------- Spark version start ------- */ - val spark23Ver = "2.3.4" - val spark24Ver = "2.4.8" - val spark30Ver = "3.0.3" + // Spark 3.0.x and 3.1.x are similar + val spark30Ver = "3.1.3" val spark32Ver = "3.2.1" val is_gpu: String = System.getProperty("is_gpu", "false") val is_opt: String = System.getProperty("is_opt", "false") - val is_spark23: String = System.getProperty("is_spark23", "false") - val is_spark24: String = System.getProperty("is_spark24", "false") val is_spark30: String = System.getProperty("is_spark30", "false") val is_spark32: String = System.getProperty("is_spark32", "false") - val sparkVer: 
String = getSparkVersion(is_spark23, is_spark24, is_spark32) + val sparkVer: String = getSparkVersion(is_spark30) /** ------- Spark version end ------- */ /** Package attributes */ - def getPackageName( - is_spark23: String, - is_spark24: String, - is_spark32: String, - is_gpu: String): String = { - if (is_gpu.equals("true") && is_spark23.equals("true")) { - "spark-nlp-gpu-spark23" - } else if (is_gpu.equals("true") && is_spark24.equals("true")) { - "spark-nlp-gpu-spark24" - } else if (is_gpu.equals("true") && is_spark32.equals("true")) { - "spark-nlp-gpu-spark32" - } else if (is_gpu.equals("true") && is_spark32.equals("false")) { + def getPackageName(is_spark30: String, is_gpu: String): String = { + if (is_gpu.equals("true") && is_spark30.equals("true")) { + "spark-nlp-gpu-spark30" + } else if (is_gpu.equals("true") && is_spark30.equals("false")) { "spark-nlp-gpu" - } else if (is_gpu.equals("false") && is_spark23.equals("true")) { - "spark-nlp-spark23" - } else if (is_gpu.equals("false") && is_spark24.equals("true")) { - "spark-nlp-spark24" - } else if (is_gpu.equals("false") && is_spark32.equals("true")) { - "spark-nlp-spark32" + } else if (is_gpu.equals("false") && is_spark30.equals("true")) { + "spark-nlp-spark30" } else { "spark-nlp" } } - def getSparkVersion(is_spark23: String, is_spark24: String, is_spark32: String): String = { - if (is_spark24 == "true") spark24Ver - else if (is_spark23 == "true") spark23Ver - else if (is_spark32 == "true") spark32Ver - else spark30Ver + def getSparkVersion(is_spark30: String): String = { + if (is_spark30 == "true") spark30Ver + else spark32Ver } - def getJavaTarget(is_spark23: String, is_spark24: String): String = { - if (is_spark24.equals("true") || is_spark23.equals("true")) { + def getJavaTarget(is_spark30: String, is_spark32: String): String = { + if (is_spark30.equals("true") || is_spark32.equals("true")) { "-target:jvm-1.8" } else { "" @@ -60,12 +43,10 @@ object Dependencies { } /** ------- Scala version start 
------- */ - lazy val scala211 = "2.11.12" lazy val scala212 = "2.12.10" - lazy val scalaVer: String = - if (is_spark23 == "true" | is_spark24 == "true") scala211 else scala212 + lazy val scalaVer: String = scala212 - lazy val supportedScalaVersions: Seq[String] = List(scala212, scala211) + lazy val supportedScalaVersions: Seq[String] = List(scala212) val scalaTestVersion = "3.2.9" @@ -90,8 +71,7 @@ object Dependencies { val greexVersion = "1.0" val greex = "com.navigamez" % "greex" % greexVersion - val json4sVersion: String = if (is_spark32 == "true") "3.7.0-M11" else "3.5.3" - + val json4sVersion: String = if (is_spark30 == "true") "3.7.0-M5" else "3.7.0-M11" val json4s = "org.json4s" %% "json4s-ext" % json4sVersion val junitVersion = "4.13.2" diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 2a24d91a2b8ff1..9c097f7f651cdc 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -59,9 +59,7 @@ def start(gpu=False, - spark23=False, - spark24=False, - spark32=False, + spark30=False, memory="16G", cache_folder="", log_folder="", @@ -74,11 +72,9 @@ def start(gpu=False, .. code-block:: python :param gpu: start Spark NLP with GPU - :param spark23: start Spark NLP on Apache Spark 2.3.x - :param spark24: start Spark NLP on Apache Spark 2.4.x - :param spark32: start Spark NLP on Apache Spark 3.2.x + :param spark30: start Spark NLP on Apache Spark 3.0.x or 3.1.x :param memory: set driver memory for SparkSession - :param cache_folder: The location to download and exctract pretrained Models and Pipelines + :param cache_folder: The location to download and extract pretrained Models and Pipelines :param log_folder: The location to save logs from annotators during training such as NerDLApproach, ClassifierDLApproach, SentimentDLApproach, MultiClassifierDLApproach, etc. 
:param cluster_tmp_dir: The location to use on a cluster for temporarily files @@ -101,16 +97,12 @@ def start(gpu=False, ---------- gpu : bool, optional Whether to enable GPU acceleration (must be set up correctly), by default False - spark23 : bool, optional - Whether to use the Spark 2.3.x version of Spark NLP, by default False - spark24 : bool, optional - Whether to use the Spark 2.4.x version of Spark NLP, by default False - spark32 : bool, optional - Whether to use the Spark 3.2.x version of Spark NLP, by default False + spark30 : bool, optional + Whether to use the Spark 3.0.x or 3.1.x version of Spark NLP, by default False memory : str, optional How much memory to allocate for the Spark driver, by default "16G" real_time_output : bool, optional - Whether to ouput in real time, by default False + Whether to output in real time, by default False output_level : int, optional Output level for logs, by default 1 @@ -129,17 +121,11 @@ def __init__(self): self.serializer, self.serializer_max_buffer = "org.apache.spark.serializer.KryoSerializer", "2000M" self.driver_max_result_size = "0" # Spark NLP on Apache Spark 3.2.x - self.maven_spark32 = "com.johnsnowlabs.nlp:spark-nlp-spark32_2.12:{}".format(current_version) - self.maven_gpu_spark32 = "com.johnsnowlabs.nlp:spark-nlp-gpu-spark32_2.12:{}".format(current_version) + self.maven_spark32 = "com.johnsnowlabs.nlp:spark-nlp_2.12:{}".format(current_version) + self.maven_gpu_spark32 = "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:{}".format(current_version) # Spark NLP on Apache Spark 3.0.x/3.1.x - self.maven_spark = "com.johnsnowlabs.nlp:spark-nlp_2.12:{}".format(current_version) - self.maven_gpu_spark = "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:{}".format(current_version) - # Spark NLP on Apache Spark 2.4.x - self.maven_spark24 = "com.johnsnowlabs.nlp:spark-nlp-spark24_2.11:{}".format(current_version) - self.maven_gpu_spark24 = "com.johnsnowlabs.nlp:spark-nlp-gpu-spark24_2.11:{}".format(current_version) - # Spark NLP on 
Apache Spark 2.3.x - self.maven_spark23 = "com.johnsnowlabs.nlp:spark-nlp-spark23_2.11:{}".format(current_version) - self.maven_gpu_spark23 = "com.johnsnowlabs.nlp:spark-nlp-gpu-spark23_2.11:{}".format(current_version) + self.maven_spark30 = "com.johnsnowlabs.nlp:spark-nlp-spark30_2.12:{}".format(current_version) + self.maven_gpu_spark30 = "com.johnsnowlabs.nlp:spark-nlp-gpu-spark30_2.12:{}".format(current_version) def start_without_realtime_output(): builder = SparkSession.builder \ @@ -150,22 +136,14 @@ def start_without_realtime_output(): .config("spark.kryoserializer.buffer.max", spark_nlp_config.serializer_max_buffer) \ .config("spark.driver.maxResultSize", spark_nlp_config.driver_max_result_size) - if gpu and spark23: - builder.config("spark.jars.packages", spark_nlp_config.maven_gpu_spark23) - elif gpu and spark24: - builder.config("spark.jars.packages", spark_nlp_config.maven_gpu_spark24) - elif gpu and spark32: - builder.config("spark.jars.packages", spark_nlp_config.maven_gpu_spark32) - elif spark23: - builder.config("spark.jars.packages", spark_nlp_config.maven_spark23) - elif spark24: - builder.config("spark.jars.packages", spark_nlp_config.maven_spark24) - elif spark32: - builder.config("spark.jars.packages", spark_nlp_config.maven_spark32) + if gpu and spark30: + builder.config("spark.jars.packages", spark_nlp_config.maven_gpu_spark30) + elif spark30: + builder.config("spark.jars.packages", spark_nlp_config.maven_spark30) elif gpu: - builder.config("spark.jars.packages", spark_nlp_config.maven_gpu_spark) + builder.config("spark.jars.packages", spark_nlp_config.maven_gpu_spark32) else: - builder.config("spark.jars.packages", spark_nlp_config.maven_spark) + builder.config("spark.jars.packages", spark_nlp_config.maven_spark32) if cache_folder != '': builder.config("spark.jsl.settings.pretrained.cache_folder", cache_folder) @@ -189,14 +167,14 @@ def __init__(self): spark_conf.set("spark.kryoserializer.buffer.max", spark_nlp_config.serializer_max_buffer) 
spark_conf.set("spark.driver.maxResultSize", spark_nlp_config.driver_max_result_size) - if spark32: - spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark32) - elif gpu and spark32: - spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark32) + if gpu and spark30: + spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark30) + elif spark30: + spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark30) elif gpu: - spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark) + spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark32) else: - spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark) + spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark32) if cache_folder != '': spark_conf.config("spark.jsl.settings.pretrained.cache_folder", cache_folder) @@ -254,21 +232,17 @@ def shutdown(self): spark_nlp_config = SparkNLPConfig() if real_time_output: - if spark23 or spark24: - spark_session = start_without_realtime_output() - return spark_session - else: - # Available from Spark 3.0.x - class SparkRealTimeOutput: + # Available from Spark 3.0.x + class SparkRealTimeOutput: - def __init__(self): - self.__spark_with_custom_gateway = start_with_realtime_output() - self.spark_session = self.__spark_with_custom_gateway.spark_session + def __init__(self): + self.__spark_with_custom_gateway = start_with_realtime_output() + self.spark_session = self.__spark_with_custom_gateway.spark_session - def shutdown(self): - self.__spark_with_custom_gateway.shutdown() + def shutdown(self): + self.__spark_with_custom_gateway.shutdown() - return SparkRealTimeOutput() + return SparkRealTimeOutput() else: spark_session = start_without_realtime_output() return spark_session diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index d0e014c7357856..3ba75bc70fb802 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -26,21 +26,6 @@ elif [[ "$PYSPARK" == "3.1"* ]]; 
then elif [[ "$PYSPARK" == "3.0"* ]]; then PYSPARK="3.0.3" echo "Installing PySpark $PYSPARK and Spark NLP $SPARKNLP" -elif [[ "$PYSPARK" == "2"* ]]; then - PYSPARK="2.4.8" - echo "Installing PySpark $PYSPARK and Spark NLP $SPARKNLP" - echo "Don't forget to use spark24=True inside sparknlp.start(spark24=True)" - apt-get update - apt-get purge -y openjdk-11* -qq > /dev/null && sudo apt-get autoremove -y -qq > /dev/null - apt-get install -y openjdk-8-jdk-headless -qq > /dev/null - - SPARKHOME="/content/spark-2.4.8-bin-hadoop2.7" - export SPARK_HOME=$SPARKHOME - export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" - - wget -q "https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz" > /dev/null - tar -xvf spark-2.4.8-bin-hadoop2.7.tgz > /dev/null - else PYSPARK="3.0.3" export JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index 57e9e311a28ffb..33b2fb478566d3 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -26,21 +26,6 @@ elif [[ "$PYSPARK" == "3.1"* ]]; then elif [[ "$PYSPARK" == "3.0"* ]]; then PYSPARK="3.0.3" echo "Installing PySpark $PYSPARK and Spark NLP $SPARKNLP" -elif [[ "$PYSPARK" == "2"* ]]; then - PYSPARK="2.4.8" - echo "Installing PySpark $PYSPARK and Spark NLP $SPARKNLP" - echo "Don't forget to use spark24=True inside sparknlp.start(spark24=True)" - apt-get update - apt-get purge -y openjdk-11* -qq > /dev/null && sudo apt-get autoremove -y -qq > /dev/null - apt-get install -y openjdk-8-jdk-headless -qq > /dev/null - - SPARKHOME="/content/spark-2.4.8-bin-hadoop2.7" - export SPARK_HOME=$SPARKHOME - export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" - - wget -q "https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz" > /dev/null - tar -xvf spark-2.4.8-bin-hadoop2.8.tgz > /dev/null - else PYSPARK="3.0.3" echo "Installing PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala 
b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index 306cd2076328cd..3ae4bd3f2bc1e3 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -21,29 +21,21 @@ import org.apache.spark.sql.SparkSession object SparkNLP { val currentVersion = "3.4.4" - val MavenSpark32 = s"com.johnsnowlabs.nlp:spark-nlp-spark32_2.12:$currentVersion" - val MavenGpuSpark32 = s"com.johnsnowlabs.nlp:spark-nlp-gpu-spark32_2.12:$currentVersion" - val MavenSpark30 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" - val MavenGpuSpark30 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" - val MavenSpark24 = s"com.johnsnowlabs.nlp:spark-nlp-spark24_2.11:$currentVersion" - val MavenGpuSpark24 = s"com.johnsnowlabs.nlp:spark-nlp-gpu-spark24_2.11:$currentVersion" - val MavenSpark23 = s"com.johnsnowlabs.nlp:spark-nlp-spark23_2.11:$currentVersion" - val MavenGpuSpark23 = s"com.johnsnowlabs.nlp:spark-nlp-gpu-spark23_2.11:$currentVersion" + val MavenSpark32 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" + val MavenGpuSpark32 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" + val MavenSpark30 = s"com.johnsnowlabs.nlp:spark-nlp-spark30_2.12:$currentVersion" + val MavenGpuSpark30 = s"com.johnsnowlabs.nlp:spark-nlp-gpu-spark30_2.12:$currentVersion" /** Start SparkSession with Spark NLP * * @param gpu * start Spark NLP with GPU - * @param spark23 - * start Spark NLP on Apache Spark 2.3.x - * @param spark24 - * start Spark NLP on Apache Spark 2.4.x - * @param spark32 + * @param spark30 - * start Spark NLP on Apache Spark 3.2.x + * start Spark NLP on Apache Spark 3.0.x or 3.1.x * @param memory * set driver memory for SparkSession * @param cache_folder - * The location to download and exctract pretrained Models and Pipelines + * The location to download and extract pretrained Models and Pipelines * @param log_folder * The location to save logs from annotators during training such as NerDLApproach, * ClassifierDLApproach, SentimentDLApproach, 
MultiClassifierDLApproach, etc. @@ -55,9 +47,7 @@ object SparkNLP { */ def start( gpu: Boolean = false, - spark23: Boolean = false, - spark24: Boolean = false, - spark32: Boolean = false, + spark30: Boolean = false, memory: String = "16G", cache_folder: String = "", log_folder: String = "", @@ -72,22 +62,14 @@ object SparkNLP { .config("spark.kryoserializer.buffer.max", "2000M") .config("spark.driver.maxResultSize", "0") - if (gpu & spark23) { - build.config("spark.jars.packages", MavenGpuSpark23) - } else if (gpu & spark24) { - build.config("spark.jars.packages", MavenGpuSpark24) - } else if (gpu & spark32) { - build.config("spark.jars.packages", MavenGpuSpark32) - } else if (spark23) { - build.config("spark.jars.packages", MavenSpark23) - } else if (spark24) { - build.config("spark.jars.packages", MavenSpark24) - } else if (spark24) { - build.config("spark.jars.packages", MavenSpark32) - } else if (gpu) { + if (gpu & spark30) { build.config("spark.jars.packages", MavenGpuSpark30) - } else { + } else if (spark30) { build.config("spark.jars.packages", MavenSpark30) + } else if (gpu) { + build.config("spark.jars.packages", MavenGpuSpark32) + } else { + build.config("spark.jars.packages", MavenSpark32) } if (cache_folder.nonEmpty) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerApproach.scala index 3d26f93f14c41b..1f0a81b5a6ff18 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerApproach.scala @@ -454,10 +454,6 @@ class EntityRulerApproach(override val uid: String) val spark = patternsDataFrame.sparkSession val sparkVersion = Version.parse(spark.version).toFloat - // TODO: drop this feature and the UDF when we deprecated Spark 2.3 support - if (sparkVersion < 2.4) { - spark.udf.register("flatten", SparkUtil.flattenArrays) - } if (fieldId.nonEmpty) { val 
patternsWithIdDataFrame = diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index b40131085c1857..ec57abc4cbfca7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -90,9 +90,7 @@ object ResourceDownloader { private val cache = mutable.Map[ResourceRequest, PipelineStage]() lazy val sparkVersion: Version = { - val spark_version = - if (ResourceHelper.spark.version.startsWith("2.3")) "2.4.4" - else ResourceHelper.spark.version + val spark_version = ResourceHelper.spark.version Version.parse(spark_version) } diff --git a/src/test/java/com/johnsnowlabs/nlp/GeneralAnnotationsTest.java b/src/test/java/com/johnsnowlabs/nlp/GeneralAnnotationsTest.java index 0e357141c28755..c749670cc4f835 100644 --- a/src/test/java/com/johnsnowlabs/nlp/GeneralAnnotationsTest.java +++ b/src/test/java/com/johnsnowlabs/nlp/GeneralAnnotationsTest.java @@ -47,8 +47,6 @@ public static void main(String[] args) { pipeline.setStages(new PipelineStage[]{document, tokenizer}); SparkSession spark = com.johnsnowlabs.nlp.SparkNLP.start( - false, - false, false, false, "16G", diff --git a/src/test/scala/com/johnsnowlabs/nlp/pretrained/CloudTestResources.scala b/src/test/scala/com/johnsnowlabs/nlp/pretrained/CloudTestResources.scala index bec976d95cd0ac..7f5acf42571f9a 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/pretrained/CloudTestResources.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/pretrained/CloudTestResources.scala @@ -51,13 +51,7 @@ object CloudTestResources { new ResourceMetadata("name", Some("en"), None, None, false, new Timestamp(1)) val name_de = new ResourceMetadata("name", Some("de"), None, None, true, new Timestamp(1)) - val name_en_251_23 = new ResourceMetadata( - "name", - Some("en"), - Some(Version(2, 5, 1)), - Some(Version(2, 3)), - true, - new Timestamp(1)) + 
val name_en_300_30 = new ResourceMetadata( "name", Some("en"), @@ -65,6 +59,7 @@ object CloudTestResources { Some(Version(3, 0)), true, new Timestamp(1)) + val bert_tiny_en_300_30 = new ResourceMetadata( "small_bert_L2_128", Some("en"), @@ -72,7 +67,8 @@ object CloudTestResources { Some(Version(3, 0)), true, new Timestamp(1)) - val all = List( + + val all: List[ResourceMetadata] = List( name_en_123_345_new, name_en_12_34_old, name_en_old, diff --git a/src/test/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloaderSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloaderSpec.scala index 3a939cab1a4abd..1556d51fe080e0 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloaderSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloaderSpec.scala @@ -80,20 +80,6 @@ class ResourceDownloaderSpec extends AnyFlatSpec { assert(found.get == b.name_en_old) } - "CloudResourceDownloader" should "allow download of model for 2.4 for 2.3 found resource" in { - val found = ResourceMetadata.resolveResource( - List(b.name_en_251_23), - ResourceRequest("name", Some("en"), "", Version(2, 5, 1), Version(2, 4, 4))) - assert(found.isDefined) - } - - "CloudResourceDownloader" should "not allow download of model for 3 for 2.3 found resource" in { - val found = ResourceMetadata.resolveResource( - List(b.name_en_251_23), - ResourceRequest("name", Some("en"), "", Version(2, 5, 1), Version(3))) - assert(found.isEmpty) - } - "CloudResourceDownloader" should "allow download of model for 3.0.x on spark 3.x found resource" in { val found = ResourceMetadata.resolveResource( List(b.name_en_300_30), diff --git a/src/test/scala/com/johnsnowlabs/nlp/pretrained/ResourceMedataTest.scala b/src/test/scala/com/johnsnowlabs/nlp/pretrained/ResourceMedataTest.scala index 0a1408bececa2e..5035fed077bf1c 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/pretrained/ResourceMedataTest.scala +++ 
b/src/test/scala/com/johnsnowlabs/nlp/pretrained/ResourceMedataTest.scala @@ -110,23 +110,6 @@ class ResourceMedataTest extends AnyFlatSpec { assert(versions.get.libVersion.get == expectedSparkNLPVersion) } - it should "get model with spark==2.4 and spark-nlp==2.4.0 when spark==2.4 and spark-nlp==2.4.5" in { - val resourcePath = "src/test/resources/resource-downloader/test_example1.json" - val mockResourceDownloader: MockResourceDownloader = new MockResourceDownloader(resourcePath) - val resourceMetadata = mockResourceDownloader.resources - val resourceRequest = ResourceRequest( - "bert_base_cased", - libVersion = Version(List(2, 4, 5)), - sparkVersion = Version(List(2, 4))) - val expectedSparkNLPVersion = Version(List(2, 4, 0)) - val expectedSparkVersion = Version(List(2, 4)) - - val versions = ResourceMetadata.resolveResource(resourceMetadata, resourceRequest) - - assert(versions.get.sparkVersion.get == expectedSparkVersion) - assert(versions.get.libVersion.get == expectedSparkNLPVersion) - } - it should "get model with spark==3.0 and sparknlp==2.4.5 when spark==3.0 and spark-nlp==2.4.5" in { val resourcePath = "src/test/resources/resource-downloader/test_example1.json" val mockResourceDownloader: MockResourceDownloader = new MockResourceDownloader(resourcePath) @@ -144,23 +127,6 @@ class ResourceMedataTest extends AnyFlatSpec { assert(versions.get.libVersion.get == expectedSparkNLPVersion) } - it should "get model with spark==2.4 and spark-nlp==3.3.0 when spark==2.4 and spark-nlp==3.3.0" in { - val resourcePath = "src/test/resources/resource-downloader/test_models_same_time.json" - val mockResourceDownloader: MockResourceDownloader = new MockResourceDownloader(resourcePath) - val resourceMetadata = mockResourceDownloader.resources - val resourceRequest = ResourceRequest( - "bert_base_cased", - libVersion = Version(List(3, 3, 0)), - sparkVersion = Version(List(2, 4))) - val expectedSparkNLPVersion = Version(List(3, 3, 0)) - val expectedSparkVersion = 
Version(List(2, 4)) - - val versions = ResourceMetadata.resolveResource(resourceMetadata, resourceRequest) - - assert(versions.get.sparkVersion.get == expectedSparkVersion) - assert(versions.get.libVersion.get == expectedSparkNLPVersion) - } - it should "get model with spark==3.0 and spark-nlp==3.3.0 when spark==3.0 and spark-nlp==3.3.0" in { val resourcePath = "src/test/resources/resource-downloader/test_models_same_time.json" val mockResourceDownloader: MockResourceDownloader = new MockResourceDownloader(resourcePath) @@ -196,23 +162,6 @@ class ResourceMedataTest extends AnyFlatSpec { } - it should "get model with spark==2.4 and spark-nlp==3.3.0 when spark==2.4 and spark-nlp==3.3.0 and newest model version is 3.0" in { - val resourcePath = "src/test/resources/resource-downloader/test_bert_v3_newest.json" - val mockResourceDownloader: MockResourceDownloader = new MockResourceDownloader(resourcePath) - val resourceMetadata = mockResourceDownloader.resources - val resourceRequest = ResourceRequest( - "bert_base_cased", - libVersion = Version(List(3, 3, 0)), - sparkVersion = Version(List(2, 4))) - val expectedSparkNLPVersion = Version(List(3, 3, 0)) - val expectedSparkVersion = Version(List(2, 4)) - - val versions = ResourceMetadata.resolveResource(resourceMetadata, resourceRequest) - - assert(versions.get.sparkVersion.get == expectedSparkVersion) - assert(versions.get.libVersion.get == expectedSparkNLPVersion) - } - it should "get model with spark==3.0 and spark-nlp==3.3.0 when spark==3.0 and spark-nlp==3.3.0 and newest model version is 2.4" in { val resourcePath = "src/test/resources/resource-downloader/test_bert_v2_newest.json" val mockResourceDownloader: MockResourceDownloader = new MockResourceDownloader(resourcePath) @@ -230,23 +179,6 @@ class ResourceMedataTest extends AnyFlatSpec { assert(versions.get.libVersion.get == expectedSparkNLPVersion) } - it should "get model with spark==2.4 and spark-nlp==3.3.0 when spark==2.4 and spark-nlp==3.3.0 and newest model 
version is 2.4" in { - val resourcePath = "src/test/resources/resource-downloader/test_bert_v2_newest.json" - val mockResourceDownloader: MockResourceDownloader = new MockResourceDownloader(resourcePath) - val resourceMetadata = mockResourceDownloader.resources - val resourceRequest = ResourceRequest( - "bert_base_cased", - libVersion = Version(List(3, 3, 0)), - sparkVersion = Version(List(2, 4))) - val expectedSparkNLPVersion = Version(List(3, 3, 0)) - val expectedSparkVersion = Version(List(2, 4)) - - val versions = ResourceMetadata.resolveResource(resourceMetadata, resourceRequest) - - assert(versions.get.sparkVersion.get == expectedSparkVersion) - assert(versions.get.libVersion.get == expectedSparkNLPVersion) - } - it should "get most recent model when spark and spark-nlp versions are the same" in { val resourcePath = "src/test/resources/resource-downloader/test_bert_v2_newest.json" val mockResourceDownloader: MockResourceDownloader = new MockResourceDownloader(resourcePath)