diff --git a/jupyter/SparkOCRPdfToText_V1.0.0_Bench.ipynb b/jupyter/SparkOCRPdfToText_V1.0.0_Bench.ipynb new file mode 100644 index 0000000..d58be3b --- /dev/null +++ b/jupyter/SparkOCRPdfToText_V1.0.0_Bench.ipynb @@ -0,0 +1,301 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmark Spark OCR Libs PdfToText V1.0.0\n", + "## Configure credentials and install dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sparkocr.enums import PageSegmentationMode\n", + "secret = \"\"\n", + "license = \"\"\n", + "version = \"1.0.0\"\n", + "spark_ocr_jar_path = \"../../target/scala-2.11\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "if python -c 'import google.colab' &> /dev/null; then\n", + " echo \"Run on Google Colab!\"\n", + " echo \"Install Open JDK\"\n", + " apt-get install -y openjdk-8-jdk-headless -qq > /dev/null\n", + " java -version\n", + "fi" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", + " os.environ[\"PATH\"] = os.environ[\"JAVA_HOME\"] + \"/bin:\" + os.environ[\"PATH\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install from PYPI using secret\n", + "%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization of spark session" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SparkConf Configured, Starting to listen on port: 50980\n", + "JAR 
PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v2.4.4
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
Spark OCR
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyspark import SparkConf\n", + "from sparkocr import start\n", + "\n", + "if license:\n", + " os.environ['JSL_OCR_LICENSE'] = license\n", + "\n", + "conf = SparkConf()\n", + "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml import PipelineModel\n", + "from sparkocr.transformers import PdfToText, PdfToImage, ImageSkewCorrector, ImageLayoutAnalyzer, ImageSplitRegions, TesseractOcr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define paths to pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "binary_pdf = \"data/pdfs/test_doc_output.pdf\"" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Read pdf objects" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "18" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdfs = spark.read.format(\"binaryFile\").load(binary_pdf).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define OCR pipeline " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# PdfToText stage: reads column 'content', writes column 'text'\n", + "pdf_to_text = PdfToText() \\\n", + " .setInputCol(\"content\") \\\n", + " .setOutputCol(\"text\") \\\n", + " .setSplitPage(False)\n", + "\n", + "pdf_to_image = PdfToImage() \\\n", + " .setOutputCol(\"image\") \\\n", + " .setFallBackCol(\"text\") \\\n", + " .setMinSizeBeforeFallback(10) \\\n", + " .setKeepInput(True) \\\n", + " .setResolution(200)\n", + "\n", + 
"skewCorrector = ImageSkewCorrector() \\\n", + " .setInputCol(\"image\") \\\n", + " .setOutputCol(\"deskewed_image\") \\\n", + " .setAutomaticSkewCorrection(True)\n", + "\n", + "layoutAnalyzer = ImageLayoutAnalyzer() \\\n", + " .setInputCol(\"deskewed_image\") \\\n", + " .setOutputCol(\"region\") \\\n", + " .setPageIteratorLevel(0) \\\n", + " .setPageSegMode(6)\n", + "\n", + "splitter = ImageSplitRegions() \\\n", + " .setInputCol(\"deskewed_image\") \\\n", + " .setInputRegionsCol(\"region\") \\\n", + " .setOutputCol(\"image_region\")\n", + "# NOTE(review): ocr reads 'deskewed_image', so 'region' / 'image_region' are not read by any later stage - confirm this is intended\n", + "ocr = TesseractOcr() \\\n", + " .setInputCol(\"deskewed_image\") \\\n", + " .setOutputCol(\"text\") \\\n", + " .setConfidenceThreshold(60) \\\n", + " .setLanguage(\"eng\") \\\n", + " .setIgnoreResolution(False) \\\n", + " .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)\n", + "\n", + "# OCR pipeline: the stages below are applied in list order\n", + "pipeline = PipelineModel(stages=[\n", + " pdf_to_text,\n", + " pdf_to_image,\n", + " skewCorrector,\n", + " layoutAnalyzer,\n", + " splitter,\n", + " ocr\n", + "\n", + "\n", + " ])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run OCR pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%%time\n", + "results = pipeline.transform(pdfs)\n", + "collected_rows = results.collect()  # collect() forces the DataFrame to be computed and returns every row to the driver\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/jupyter/SparkOCRPdfToText_V1.1.2_Bench.ipynb b/jupyter/SparkOCRPdfToText_V1.1.2_Bench.ipynb new file mode 100644 index 0000000..fed1976 --- 
/dev/null +++ b/jupyter/SparkOCRPdfToText_V1.1.2_Bench.ipynb @@ -0,0 +1,281 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmark Spark OCR Libs PdfToText V1.1.2\n", + "## Configure credentials and install dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sparkocr.enums import PageSegmentationMode\n", + "secret = \"\"\n", + "license = \"\"\n", + "version = \"1.1.2\"\n", + "spark_ocr_jar_path = \"../../target/scala-2.11\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "if python -c 'import google.colab' &> /dev/null; then\n", + " echo \"Run on Google Colab!\"\n", + " echo \"Install Open JDK\"\n", + " apt-get install -y openjdk-8-jdk-headless -qq > /dev/null\n", + " java -version\n", + "fi" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", + " os.environ[\"PATH\"] = os.environ[\"JAVA_HOME\"] + \"/bin:\" + os.environ[\"PATH\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR: Could not find a version that satisfies the requirement spark-ocr==1.1.2 (from versions: none)\n", + "ERROR: No matching distribution found for spark-ocr==1.1.2\n", + "WARNING: You are using pip version 20.0.2; however, version 20.1.1 is available.\n", + "You should consider upgrading via the 'c:\\users\\pc\\appdata\\local\\programs\\python\\python37\\python.exe -m pip 
install --upgrade pip' command.\n" + ] + } + ], + "source": [ + "# install from PYPI using secret\n", + "%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization of spark session" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "Exception", + "evalue": "File C:\\Users\\PC\\IdeaProjects\\target\\scala-2.11\\spark-ocr-assembly-1.3.0rc1.jar not exist. Please specify path where located spark-ocr-assembly-1.3.0rc1.jar", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mException\u001B[0m Traceback (most recent call last)", + "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 7\u001B[0m \u001B[0mconf\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mSparkConf\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 8\u001B[1;33m \u001B[0mspark\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mstart\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0msecret\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0msecret\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mjar_path\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mspark_ocr_jar_path\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mextra_conf\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mconf\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 9\u001B[0m \u001B[0mspark\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 10\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", + "\u001B[1;32mc:\\users\\pc\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sparkocr\\__init__.py\u001B[0m in \u001B[0;36mstart\u001B[1;34m(secret, jar_path, extra_conf, nlp_version, 
nlp_internal)\u001B[0m\n\u001B[0;32m 54\u001B[0m \u001B[0mjars\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpath\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mabspath\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mos\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpath\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mjoin\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mjar_path\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mocr_jar\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 55\u001B[0m \u001B[1;32mif\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpath\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0misfile\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mjars\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m---> 56\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mException\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m\"File {} not exist. Please specify path where located {}\"\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mformat\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mjars\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mocr_jar\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 57\u001B[0m \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 58\u001B[0m \u001B[1;32mraise\u001B[0m \u001B[0mException\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m\"Need to specify 'secret' or 'jar_path' param.\"\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", + "\u001B[1;31mException\u001B[0m: File C:\\Users\\PC\\IdeaProjects\\target\\scala-2.11\\spark-ocr-assembly-1.3.0rc1.jar not exist. 
Please specify path where located spark-ocr-assembly-1.3.0rc1.jar" + ] + } + ], + "source": [ + "from pyspark import SparkConf\n", + "from sparkocr import start\n", + "\n", + "if license:\n", + " os.environ['JSL_OCR_LICENSE'] = license\n", + "\n", + "conf = SparkConf()\n", + "spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml import PipelineModel\n", + "from sparkocr.transformers import PdfToText, PdfToImage, ImageSkewCorrector, ImageLayoutAnalyzer, ImageSplitRegions, TesseractOcr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define paths to pdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "binary_pdf = \"data/pdfs/test_doc_output.pdf\"" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Read pdf objects" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdfs = spark.read.format(\"binaryFile\").load(binary_pdf).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define OCR pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# PdfToText stage: reads column 'content', writes column 'text'\n", + "pdf_to_text = PdfToText() \\\n", + " .setInputCol(\"content\") \\\n", + " .setOutputCol(\"text\") \\\n", + " .setSplitPage(False)\n", + "\n", + "pdf_to_image = PdfToImage() \\\n", + " .setOutputCol(\"image\") \\\n", + " .setFallBackCol(\"text\") \\\n", + " .setMinSizeBeforeFallback(10) \\\n", + " .setKeepInput(True) \\\n", + " .setResolution(200)\n", + "\n", + "skewCorrector = ImageSkewCorrector() \\\n", + " .setInputCol(\"image\") \\\n", + " .setOutputCol(\"deskewed_image\") \\\n", + " .setAutomaticSkewCorrection(True)\n", + "\n", 
+ "\n", + "layoutAnalyzer = ImageLayoutAnalyzer() \\\n", + " .setInputCol(\"deskewed_image\") \\\n", + " .setOutputCol(\"region\") \\\n", + " .setPageIteratorLevel(0) \\\n", + " .setPageSegMode(6)\n", + "\n", + "splitter = ImageSplitRegions() \\\n", + " .setInputCol(\"deskewed_image\") \\\n", + " .setInputRegionsCol(\"region\") \\\n", + " .setOutputCol(\"image_region\")\n", + "# NOTE(review): ocr reads 'deskewed_image', so 'region' / 'image_region' are not read by any later stage - confirm this is intended\n", + "ocr = TesseractOcr() \\\n", + " .setInputCol(\"deskewed_image\") \\\n", + " .setOutputCol(\"text\") \\\n", + " .setConfidenceThreshold(60) \\\n", + " .setLanguage(\"eng\") \\\n", + " .setIgnoreResolution(False) \\\n", + " .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)\n", + "\n", + "# OCR pipeline: the stages below are applied in list order\n", + "pipeline = PipelineModel(stages=[\n", + " pdf_to_text,\n", + " pdf_to_image,\n", + " skewCorrector,\n", + " layoutAnalyzer,\n", + " splitter,\n", + " ocr\n", + "\n", + "\n", + " ])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run OCR pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%%time\n", + "results = pipeline.transform(pdfs)\n", + "collected_rows = results.collect()  # collect() forces the DataFrame to be computed and returns every row to the driver\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/jupyter/data/pdfs/test_doc_output.pdf b/jupyter/data/pdfs/test_doc_output.pdf new file mode 100644 index 0000000..ce3f904 Binary files /dev/null and b/jupyter/data/pdfs/test_doc_output.pdf differ