Skip to content

Performance reduced in 1.1.2 version #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
301 changes: 301 additions & 0 deletions jupyter/SparkOCRPdfToText_V1.0.0_Bench.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Benchmark Spark OCR Libs PdfToText V1.0.0\n",
"## Configuration"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Enum used further down to pick the Tesseract page segmentation mode.\n",
"from sparkocr.enums import PageSegmentationMode\n",
"\n",
"# Credentials are left blank here; supply them locally rather than committing them.\n",
"secret = \"\"\n",
"license = \"\"\n",
"\n",
"# Spark OCR version to install, and the jar location handed to start() below.\n",
"version = \"1.0.0\"\n",
"spark_ocr_jar_path = \"../../target/scala-2.11\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# Google Colab only (detected by a successful import of google.colab):\n",
"# install a headless OpenJDK 8 quietly, then print the Java version.\n",
"if python -c 'import google.colab' &> /dev/null; then\n",
"    echo \"Run on Google Colab!\"\n",
"    echo \"Install Open JDK\"\n",
"    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null\n",
"    java -version\n",
"fi"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"# Colab only: point JAVA_HOME at OpenJDK 8 and put its bin directory first on PATH.\n",
"if 'google.colab' in sys.modules:\n",
"    java_home = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n",
"    os.environ[\"JAVA_HOME\"] = java_home\n",
"    os.environ[\"PATH\"] = java_home + \"/bin:\" + os.environ[\"PATH\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install the pinned Spark OCR release from the private index. The secret is part of\n",
"# the index URL and may show up in pip's output, so clear this cell's output before sharing.\n",
"%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization of spark session"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SparkConf Configured, Starting to listen on port: 50980\n",
"JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <p><b>SparkSession - in-memory</b></p>\n",
" \n",
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://melnyks-mbp:4040\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v2.4.4</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
" <dd><code>Spark OCR</code></dd>\n",
" </dl>\n",
" </div>\n",
" \n",
" </div>\n",
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x122268d10>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from pyspark import SparkConf\n",
"from sparkocr import start\n",
"\n",
"# Export the license through the environment only when one was filled in above.\n",
"if license:\n",
"    os.environ['JSL_OCR_LICENSE'] = license\n",
"\n",
"# Start the session with the configured secret, jar path and a default SparkConf;\n",
"# the trailing expression renders the session summary.\n",
"conf = SparkConf()\n",
"spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n",
"spark"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.ml import PipelineModel\n",
"\n",
"# Explicit names instead of a star import, so every stage class used by the\n",
"# pipeline can be traced back to the module it comes from.\n",
"from sparkocr.transformers import (\n",
"    ImageLayoutAnalyzer,\n",
"    ImageSkewCorrector,\n",
"    ImageSplitRegions,\n",
"    PdfToImage,\n",
"    PdfToText,\n",
"    TesseractOcr,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the path to the PDF"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Relative path to the PDF that the benchmark loads.\n",
"binary_pdf = \"data/pdfs/test_doc_output.pdf\""
]
},
{
"cell_type": "markdown",
"source": [
"## Read pdf objects"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"18"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pdfs = spark.read.format(\"binaryFile\").load(binary_pdf).cache()\n",
"\n",
"# cache() is lazy: count() forces the read now, so loading the file is not\n",
"# charged to the timed OCR cell, and the number of loaded rows is displayed.\n",
"pdfs.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define OCR pipeline"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Extract embedded text from the binary content column without splitting pages.\n",
"pdf_to_text = (\n",
"    PdfToText()\n",
"    .setInputCol(\"content\")\n",
"    .setOutputCol(\"text\")\n",
"    .setSplitPage(False)\n",
")\n",
"\n",
"# Render pages to images at resolution 200, using the text column as fallback column.\n",
"# NOTE(review): exact fallback semantics of setMinSizeBeforeFallback(10) - confirm in the Spark OCR docs.\n",
"pdf_to_image = (\n",
"    PdfToImage()\n",
"    .setOutputCol(\"image\")\n",
"    .setFallBackCol(\"text\")\n",
"    .setMinSizeBeforeFallback(10)\n",
"    .setKeepInput(True)\n",
"    .setResolution(200)\n",
")\n",
"\n",
"# Deskew the rendered page images.\n",
"skew_corrector = (\n",
"    ImageSkewCorrector()\n",
"    .setInputCol(\"image\")\n",
"    .setOutputCol(\"deskewed_image\")\n",
"    .setAutomaticSkewCorrection(True)\n",
")\n",
"\n",
"# Detect regions on the deskewed image (iterator level 0, page segmentation mode 6).\n",
"layout_analyzer = (\n",
"    ImageLayoutAnalyzer()\n",
"    .setInputCol(\"deskewed_image\")\n",
"    .setOutputCol(\"region\")\n",
"    .setPageIteratorLevel(0)\n",
"    .setPageSegMode(6)\n",
")\n",
"\n",
"# Cut the deskewed image into the detected regions.\n",
"region_splitter = (\n",
"    ImageSplitRegions()\n",
"    .setInputCol(\"deskewed_image\")\n",
"    .setInputRegionsCol(\"region\")\n",
"    .setOutputCol(\"image_region\")\n",
")\n",
"\n",
"# Run Tesseract on the deskewed page image.\n",
"# NOTE(review): this stage reads deskewed_image, not image_region - confirm that the\n",
"# layout/split stages are meant to be part of the measured pipeline.\n",
"tesseract_ocr = (\n",
"    TesseractOcr()\n",
"    .setInputCol(\"deskewed_image\")\n",
"    .setOutputCol(\"text\")\n",
"    .setConfidenceThreshold(60)\n",
"    .setLanguage(\"eng\")\n",
"    .setIgnoreResolution(False)\n",
"    .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)\n",
")\n",
"\n",
"# OCR pipeline: the stages run in the order listed.\n",
"pipeline = PipelineModel(stages=[\n",
"    pdf_to_text,\n",
"    pdf_to_image,\n",
"    skew_corrector,\n",
"    layout_analyzer,\n",
"    region_splitter,\n",
"    tesseract_ocr,\n",
"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run OCR pipeline"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"%%time\n",
"# Timed section: transform() only builds the plan; collect() is the action that\n",
"# actually runs the pipeline and brings every result row to the driver.\n",
"results = pipeline.transform(pdfs)\n",
"# Named collected_rows rather than pd, which is the conventional pandas alias.\n",
"collected_rows = results.collect()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading