diff --git a/jupyter/SparkOCRPdfToText_V1.0.0_Bench.ipynb b/jupyter/SparkOCRPdfToText_V1.0.0_Bench.ipynb
new file mode 100644
index 0000000..d58be3b
--- /dev/null
+++ b/jupyter/SparkOCRPdfToText_V1.0.0_Bench.ipynb
@@ -0,0 +1,301 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Benchmark Spark OCR Libs PdfToText V1.0.0\n",
+ "## Set up parameters and environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sparkocr.enums import PageSegmentationMode\n",
+ "secret = \"\"\n",
+ "license = \"\"\n",
+ "version = \"1.0.0\"\n",
+ "spark_ocr_jar_path = \"../../target/scala-2.11\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "if python -c 'import google.colab' &> /dev/null; then\n",
+ " echo \"Run on Google Colab!\"\n",
+ " echo \"Install Open JDK\"\n",
+ " apt-get install -y openjdk-8-jdk-headless -qq > /dev/null\n",
+ " java -version\n",
+ "fi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "\n",
+ "if 'google.colab' in sys.modules:\n",
+ " os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n",
+ " os.environ[\"PATH\"] = os.environ[\"JAVA_HOME\"] + \"/bin:\" + os.environ[\"PATH\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# install from PYPI using secret\n",
+ "%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initialization of spark session"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SparkConf Configured, Starting to listen on port: 50980\n",
+ "JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ "
SparkSession - in-memory
\n",
+ " \n",
+ "
\n",
+ "
SparkContext
\n",
+ "\n",
+ "
Spark UI
\n",
+ "\n",
+ "
\n",
+ " - Version
\n",
+ " v2.4.4
\n",
+ " - Master
\n",
+ " local[*]
\n",
+ " - AppName
\n",
+ " Spark OCR
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from pyspark import SparkConf\n",
+ "from sparkocr import start\n",
+ "\n",
+ "if license:\n",
+ " os.environ['JSL_OCR_LICENSE'] = license\n",
+ "\n",
+ "conf = SparkConf()\n",
+ "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyspark.ml import PipelineModel\n",
+ "from sparkocr.transformers import *"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define paths to pdf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "binary_pdf = \"data/pdfs/test_doc_output.pdf\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Read pdf objects"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "18"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pdfs = spark.read.format(\"binaryFile\").load(binary_pdf).cache()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define OCR pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract the embedded text layer straight from the PDF bytes\n",
+ "pdf_to_text = PdfToText() \\\n",
+ "    .setInputCol(\"content\") \\\n",
+ "    .setOutputCol(\"text\") \\\n",
+ "    .setSplitPage(False)\n",
+ "\n",
+ "# Render pages to images; \"text\" is passed as the fallback column\n",
+ "# NOTE(review): presumably pages are only rasterized when the extracted text is\n",
+ "# shorter than 10 characters - confirm against the PdfToImage docs for this version\n",
+ "pdf_to_image = PdfToImage() \\\n",
+ "    .setOutputCol(\"image\") \\\n",
+ "    .setFallBackCol(\"text\") \\\n",
+ "    .setMinSizeBeforeFallback(10) \\\n",
+ "    .setKeepInput(True) \\\n",
+ "    .setResolution(200)\n",
+ "\n",
+ "# Deskew the rendered page image\n",
+ "skew_corrector = ImageSkewCorrector() \\\n",
+ "    .setInputCol(\"image\") \\\n",
+ "    .setOutputCol(\"deskewed_image\") \\\n",
+ "    .setAutomaticSkewCorrection(True)\n",
+ "\n",
+ "# Detect regions on the deskewed image\n",
+ "# NOTE(review): 0 and 6 are raw numeric values; 6 presumably maps to Tesseract's\n",
+ "# 'single uniform block' page segmentation mode - confirm, and prefer named enum\n",
+ "# members if this library version accepts them here\n",
+ "layout_analyzer = ImageLayoutAnalyzer() \\\n",
+ "    .setInputCol(\"deskewed_image\") \\\n",
+ "    .setOutputCol(\"region\") \\\n",
+ "    .setPageIteratorLevel(0) \\\n",
+ "    .setPageSegMode(6)\n",
+ "\n",
+ "# Cut the detected regions out of the deskewed image\n",
+ "splitter = ImageSplitRegions() \\\n",
+ "    .setInputCol(\"deskewed_image\") \\\n",
+ "    .setInputRegionsCol(\"region\") \\\n",
+ "    .setOutputCol(\"image_region\")\n",
+ "\n",
+ "# Recognize text with Tesseract\n",
+ "# NOTE(review): the input is the whole \"deskewed_image\", not the \"image_region\"\n",
+ "# column produced by the splitter - confirm the layout/split stages are meant to be\n",
+ "# part of the measured workload even though their output is not consumed here\n",
+ "ocr = TesseractOcr() \\\n",
+ "    .setInputCol(\"deskewed_image\") \\\n",
+ "    .setOutputCol(\"text\") \\\n",
+ "    .setConfidenceThreshold(60) \\\n",
+ "    .setLanguage(\"eng\") \\\n",
+ "    .setIgnoreResolution(False) \\\n",
+ "    .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)\n",
+ "\n",
+ "# Assemble the stages in execution order\n",
+ "pipeline = PipelineModel(stages=[\n",
+ "    pdf_to_text,\n",
+ "    pdf_to_image,\n",
+ "    skew_corrector,\n",
+ "    layout_analyzer,\n",
+ "    splitter,\n",
+ "    ocr,\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run OCR pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "# transform() only builds the lazy plan; collect() is the action that runs it,\n",
+ "# so the reported time covers the whole pipeline execution.\n",
+ "results = pipeline.transform(pdfs)\n",
+ "collected_rows = results.collect()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/jupyter/SparkOCRPdfToText_V1.1.2_Bench.ipynb b/jupyter/SparkOCRPdfToText_V1.1.2_Bench.ipynb
new file mode 100644
index 0000000..fed1976
--- /dev/null
+++ b/jupyter/SparkOCRPdfToText_V1.1.2_Bench.ipynb
@@ -0,0 +1,281 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Benchmark Spark OCR Libs PdfToText V1.1.2\n",
+ "## Set up parameters and environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sparkocr.enums import PageSegmentationMode\n",
+ "secret = \"\"\n",
+ "license = \"\"\n",
+ "version = \"1.1.2\"\n",
+ "spark_ocr_jar_path = \"../../target/scala-2.11\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "if python -c 'import google.colab' &> /dev/null; then\n",
+ " echo \"Run on Google Colab!\"\n",
+ " echo \"Install Open JDK\"\n",
+ " apt-get install -y openjdk-8-jdk-headless -qq > /dev/null\n",
+ " java -version\n",
+ "fi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "\n",
+ "if 'google.colab' in sys.modules:\n",
+ " os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n",
+ " os.environ[\"PATH\"] = os.environ[\"JAVA_HOME\"] + \"/bin:\" + os.environ[\"PATH\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "ERROR: Could not find a version that satisfies the requirement spark-ocr==1.1.2 (from versions: none)\n",
+ "ERROR: No matching distribution found for spark-ocr==1.1.2\n",
+ "WARNING: You are using pip version 20.0.2; however, version 20.1.1 is available.\n",
+ "You should consider upgrading via the 'c:\\users\\pc\\appdata\\local\\programs\\python\\python37\\python.exe -m pip install --upgrade pip' command.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# install from PYPI using secret\n",
+ "%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initialization of spark session"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "Exception",
+ "evalue": "File C:\\Users\\PC\\IdeaProjects\\target\\scala-2.11\\spark-ocr-assembly-1.3.0rc1.jar not exist. Please specify path where located spark-ocr-assembly-1.3.0rc1.jar",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[1;31mException\u001B[0m Traceback (most recent call last)",
+ "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 7\u001B[0m \u001B[0mconf\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mSparkConf\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 8\u001B[1;33m \u001B[0mspark\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mstart\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0msecret\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0msecret\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mjar_path\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mspark_ocr_jar_path\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mextra_conf\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mconf\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 9\u001B[0m \u001B[0mspark\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 10\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n",
+ "\u001B[1;32mc:\\users\\pc\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sparkocr\\__init__.py\u001B[0m in \u001B[0;36mstart\u001B[1;34m(secret, jar_path, extra_conf, nlp_version, nlp_internal)\u001B[0m\n\u001B[0;32m 54\u001B[0m \u001B[0mjars\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpath\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mabspath\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mos\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpath\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mjoin\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mjar_path\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mocr_jar\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 55\u001B[0m \u001B[1;32mif\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpath\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0misfile\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mjars\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m---> 56\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mException\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m\"File {} not exist. Please specify path where located {}\"\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mformat\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mjars\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mocr_jar\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 57\u001B[0m \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 58\u001B[0m \u001B[1;32mraise\u001B[0m \u001B[0mException\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m\"Need to specify 'secret' or 'jar_path' param.\"\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
+ "\u001B[1;31mException\u001B[0m: File C:\\Users\\PC\\IdeaProjects\\target\\scala-2.11\\spark-ocr-assembly-1.3.0rc1.jar not exist. Please specify path where located spark-ocr-assembly-1.3.0rc1.jar"
+ ]
+ }
+ ],
+ "source": [
+ "from pyspark import SparkConf\n",
+ "from sparkocr import start\n",
+ "\n",
+ "if license:\n",
+ " os.environ['JSL_OCR_LICENSE'] = license\n",
+ "\n",
+ "conf = SparkConf()\n",
+ "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyspark.ml import PipelineModel\n",
+ "from sparkocr.transformers import *"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define paths to pdf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "binary_pdf = \"data/pdfs/test_doc_output.pdf\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Read pdf objects"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pdfs = spark.read.format(\"binaryFile\").load(binary_pdf).cache()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define OCR pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract the embedded text layer straight from the PDF bytes\n",
+ "pdf_to_text = PdfToText() \\\n",
+ "    .setInputCol(\"content\") \\\n",
+ "    .setOutputCol(\"text\") \\\n",
+ "    .setSplitPage(False)\n",
+ "\n",
+ "# Render pages to images; \"text\" is passed as the fallback column\n",
+ "# NOTE(review): presumably pages are only rasterized when the extracted text is\n",
+ "# shorter than 10 characters - confirm against the PdfToImage docs for this version\n",
+ "pdf_to_image = PdfToImage() \\\n",
+ "    .setOutputCol(\"image\") \\\n",
+ "    .setFallBackCol(\"text\") \\\n",
+ "    .setMinSizeBeforeFallback(10) \\\n",
+ "    .setKeepInput(True) \\\n",
+ "    .setResolution(200)\n",
+ "\n",
+ "# Deskew the rendered page image\n",
+ "skew_corrector = ImageSkewCorrector() \\\n",
+ "    .setInputCol(\"image\") \\\n",
+ "    .setOutputCol(\"deskewed_image\") \\\n",
+ "    .setAutomaticSkewCorrection(True)\n",
+ "\n",
+ "# Detect regions on the deskewed image\n",
+ "# NOTE(review): 0 and 6 are raw numeric values; 6 presumably maps to Tesseract's\n",
+ "# 'single uniform block' page segmentation mode - confirm, and prefer named enum\n",
+ "# members if this library version accepts them here\n",
+ "layout_analyzer = ImageLayoutAnalyzer() \\\n",
+ "    .setInputCol(\"deskewed_image\") \\\n",
+ "    .setOutputCol(\"region\") \\\n",
+ "    .setPageIteratorLevel(0) \\\n",
+ "    .setPageSegMode(6)\n",
+ "\n",
+ "# Cut the detected regions out of the deskewed image\n",
+ "splitter = ImageSplitRegions() \\\n",
+ "    .setInputCol(\"deskewed_image\") \\\n",
+ "    .setInputRegionsCol(\"region\") \\\n",
+ "    .setOutputCol(\"image_region\")\n",
+ "\n",
+ "# Recognize text with Tesseract\n",
+ "# NOTE(review): the input is the whole \"deskewed_image\", not the \"image_region\"\n",
+ "# column produced by the splitter - confirm the layout/split stages are meant to be\n",
+ "# part of the measured workload even though their output is not consumed here\n",
+ "ocr = TesseractOcr() \\\n",
+ "    .setInputCol(\"deskewed_image\") \\\n",
+ "    .setOutputCol(\"text\") \\\n",
+ "    .setConfidenceThreshold(60) \\\n",
+ "    .setLanguage(\"eng\") \\\n",
+ "    .setIgnoreResolution(False) \\\n",
+ "    .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)\n",
+ "\n",
+ "# Assemble the stages in execution order\n",
+ "pipeline = PipelineModel(stages=[\n",
+ "    pdf_to_text,\n",
+ "    pdf_to_image,\n",
+ "    skew_corrector,\n",
+ "    layout_analyzer,\n",
+ "    splitter,\n",
+ "    ocr,\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run OCR pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "# transform() only builds the lazy plan; collect() is the action that runs it,\n",
+ "# so the reported time covers the whole pipeline execution.\n",
+ "results = pipeline.transform(pdfs)\n",
+ "collected_rows = results.collect()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/jupyter/data/pdfs/test_doc_output.pdf b/jupyter/data/pdfs/test_doc_output.pdf
new file mode 100644
index 0000000..ce3f904
Binary files /dev/null and b/jupyter/data/pdfs/test_doc_output.pdf differ