diff --git a/.ci/Dockerfile.build b/.ci/Dockerfile.build
new file mode 100644
index 0000000..5d15c6c
--- /dev/null
+++ b/.ci/Dockerfile.build
@@ -0,0 +1,6 @@
+# Build image for the Databricks notebook CI scripts
+FROM python:3.7-alpine
+
+ENV LC_ALL=C
+
+RUN pip install databricks-cli requests pytest
\ No newline at end of file
diff --git a/.ci/evaluatenotebookruns.py b/.ci/evaluatenotebookruns.py
new file mode 100644
index 0000000..e71fe26
--- /dev/null
+++ b/.ci/evaluatenotebookruns.py
@@ -0,0 +1,50 @@
+# evaluatenotebookruns.py
+import unittest
+import json
+import glob
+import os
+import logging
+
+class TestJobOutput(unittest.TestCase):
+
+    test_output_path = '#ENV#'
+
+    # def test_performance(self):
+    #     path = self.test_output_path
+    #     statuses = []
+    #
+    #     for filename in glob.glob(os.path.join(path, '*.json')):
+    #         print('Evaluating: ' + filename)
+    #         data = json.load(open(filename))
+    #         duration = data['execution_duration']
+    #         if duration > 100000:
+    #             status = 'FAILED'
+    #         else:
+    #             status = 'SUCCESS'
+    #
+    #         statuses.append(status)
+    #
+    #     self.assertFalse('FAILED' in statuses)
+
+    # Fail if any notebook run finished FAILED or was still RUNNING when polling stopped
+    def test_job_run(self):
+        path = self.test_output_path
+        statuses = []
+
+        for filename in glob.glob(os.path.join(path, '*.json')):
+            logging.info('Evaluating: ' + filename)
+            print('Evaluating: ' + filename)
+            with open(filename) as result_file:
+                data = json.load(result_file)
+            print(data)
+            if data['state']['life_cycle_state'] == "RUNNING":
+                statuses.append('NOT_COMPLETED')
+            else:
+                status = data['state']['result_state']
+                statuses.append(status)
+
+        self.assertNotIn('FAILED', statuses)
+        self.assertNotIn('NOT_COMPLETED', statuses)
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/.ci/executenotebook.py b/.ci/executenotebook.py
new file mode 100644
index 0000000..7fc5766
--- /dev/null
+++ b/.ci/executenotebook.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python3
+# executenotebook.py
+import json
+import requests
+import os
+import sys
+import getopt
+import time
+import logging
+
+
+def main():
+    workspace = ''
+    token = ''
+    clusterid = ''
+    localpath = ''
+    workspacepath = ''
+    outfilepath = ''
+    ignore = ''
+
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:l:w:o:i:',
+                                   ['workspace=', 'token=', 'clusterid=', 'localpath=', 'workspacepath=', 'outfilepath=', 'ignore='])
+    except getopt.GetoptError:
+        print(
+            'executenotebook.py -s <workspace> -t <token> -c <clusterid> -l <localpath> -w <workspacepath> -o <outfilepath> -i <ignore>')
+        sys.exit(2)
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print(
+                'executenotebook.py -s <workspace> -t <token> -c <clusterid> -l <localpath> -w <workspacepath> -o <outfilepath> -i <ignore>')
+            sys.exit()
+        elif opt in ('-s', '--workspace'):
+            workspace = arg
+        elif opt in ('-t', '--token'):
+            token = arg
+        elif opt in ('-c', '--clusterid'):
+            clusterid = arg
+        elif opt in ('-l', '--localpath'):
+            localpath = arg
+        elif opt in ('-w', '--workspacepath'):
+            workspacepath = arg
+        elif opt in ('-o', '--outfilepath'):
+            outfilepath = arg
+        elif opt in ('-i', '--ignore'):
+            ignore = arg
+
+    print('-s is ' + workspace)
+    print('-t is ' + token)
+    print('-c is ' + clusterid)
+    print('-l is ' + localpath)
+    print('-w is ' + workspacepath)
+    print('-o is ' + outfilepath)
+    print('-i is ' + ignore)
+    # Collect notebooks by walking the local path
+
+    ignore = ignore.split(',')
+
+    notebooks = []
+    for path, subdirs, files in os.walk(localpath):
+        for name in files:
+            if name in ignore:
+                logging.warning(f'Ignoring {name}')
+                continue
+            fullpath = path + '/' + name
+            # strip the local repo prefix but keep the workspace path
+            fullworkspacepath = workspacepath + path.replace(localpath, '')
+
+            name, file_extension = os.path.splitext(fullpath)
+            if file_extension.lower() in ['.ipynb']:
+                row = [fullpath, fullworkspacepath, 1]
+                notebooks.append(row)
+
+    # submit one run per notebook
+    for notebook in notebooks:
+        nameonly = os.path.basename(notebook[0])
+        workspacepath = notebook[1]
+
+        name, file_extension = os.path.splitext(nameonly)
+
+        # workspace path drops the extension
+        fullworkspacepath = workspacepath + '/' + name
+
+        print('Running job for: ' + fullworkspacepath)
+        values = {'run_name': name, 'existing_cluster_id': clusterid, 'timeout_seconds': 3600, 'notebook_task': {'notebook_path': fullworkspacepath}}
+
+        resp = requests.post(workspace + '/api/2.0/jobs/runs/submit',
+                             data=json.dumps(values), auth=("token", token))
+        runjson = resp.text
+        print("runjson:" + runjson)
+        d = json.loads(runjson)
+        runid = d['run_id']
+
+        # poll the run state every 20 seconds until it finishes, giving up after ~8 minutes
+        i = 0
+        waiting = True
+        while waiting:
+            time.sleep(20)
+            jobresp = requests.get(workspace + '/api/2.0/jobs/runs/get?run_id=' + str(runid),
+                                   auth=("token", token))
+            jobjson = jobresp.text
+            print("jobjson:" + jobjson)
+            j = json.loads(jobjson)
+            current_state = j['state']['life_cycle_state']
+            runid = j['run_id']
+            if current_state in ['TERMINATED', 'INTERNAL_ERROR', 'SKIPPED'] or i >= 24:
+                break
+            i = i + 1
+
+        if outfilepath != '':
+            with open(outfilepath + '/' + str(runid) + '.json', 'w') as outfile:
+                outfile.write(json.dumps(j))
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..2d5df80
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,213 @@
+@Library('jenkinslib')_
+
+cluster_id = ""
+ocr_versions = ""
+nlp_versions = ""
+nlp_healthcare_versions = ""
+databricks_versions = ""
+nlp_version_prefix = ""
+
+def DBTOKEN = "DATABRICKS_TOKEN"
+def DBURL = "https://dbc-6ca13d9d-74bb.cloud.databricks.com"
+def SCRIPTPATH = "./.ci"
+def NOTEBOOKPATH = "./databricks/python"
+def WORKSPACEPATH = "/Shared/Spark OCR/tests"
+def OUTFILEPATH = "."
+def TESTRESULTPATH = "./reports/junit"
+def IGNORE = "3. Compare CPU and GPU image processing with Spark OCR.ipynb"
+
+def SPARK_NLP_VERSION = params.nlp_version
+def SPARK_NLP_HEALTHCARE_VERSION = params.nlp_healthcare_version
+def SPARK_OCR_VERSION = params.ocr_version
+
+def PYPI_REPO_HEALTHCARE_SECRET = sparknlp_helpers.spark_nlp_healthcare_secret(SPARK_NLP_HEALTHCARE_VERSION)
+def PYPI_REPO_OCR_SECRET = sparknlp_helpers.spark_ocr_secret(SPARK_OCR_VERSION)
+
+def DATABRICKS_RUNTIME_VERSION = params.databricks_runtime == null ? '7.3.x-scala2.12' : params.databricks_runtime.tokenize('|')[1]
+def SPARK_VERSION = params.spark_version == null ? 'spark30' : params.spark_version
+
+switch(SPARK_VERSION) {
+case 'spark24':
+    nlp_version_prefix = "-spark24"
+    break
+case 'spark23':
+    nlp_version_prefix = "-spark23"
+    break
+case 'spark30':
+    nlp_version_prefix = ""
+    break
+case 'spark32':
+    nlp_version_prefix = "-spark32"
+}
+
+def String get_releases(repo)
+{
+    def releasesString = sh(returnStdout: true, script: """gh api --paginate -H "Accept: application/vnd.github.v3+json" /repos/${repo}/releases""")
+    def releasesJson = readJSON text: releasesString
+    return releasesJson.collect{ it['tag_name'] }.join("\n")
+}
+
+node {
+    withCredentials([usernamePassword(credentialsId: '55e7e818-4ccf-4d23-b54c-fd97c21081ba',
+                                      usernameVariable: 'GITHUB_USER',
+                                      passwordVariable: 'GITHUB_TOKEN')]) {
+        ocr_versions = get_releases("johnsnowlabs/spark-ocr")
+        nlp_versions = get_releases("johnsnowlabs/spark-nlp")
+        nlp_healthcare_versions = get_releases("johnsnowlabs/spark-nlp-internal")
+    }
+    withCredentials([string(credentialsId: DBTOKEN, variable: 'TOKEN')]) {
+        def databricksVersionsString = sh(returnStdout: true, script: 'curl --header "Authorization: Bearer $TOKEN" -X GET https://dbc-6ca13d9d-74bb.cloud.databricks.com/api/2.0/clusters/spark-versions')
+        def databricksVersionsJson = readJSON text: databricksVersionsString
+        databricks_versions = databricksVersionsJson['versions'].collect{ it['name'] + " |" + it['key'] }.sort().join("\n")
+    }
+}
+
+pipeline {
+    agent {
+        dockerfile {
+            filename '.ci/Dockerfile.build'
+        }
+    }
+    environment {
+        DATABRICKS_CONFIG_FILE = ".databricks.cfg"
+        GITHUB_CREDS = credentials('55e7e818-4ccf-4d23-b54c-fd97c21081ba')
+    }
+    parameters {
+        choice(
+            name: 'databricks_runtime',
+            choices: '7.3 LTS Spark 3.0.1 |7.3.x-scala2.12\n' + databricks_versions,
+            description: 'Databricks runtime version'
+        )
+        choice(
+            name: 'ocr_version',
+            choices: ocr_versions,
+            description: 'Spark OCR version'
+        )
+        choice(
+            name: 'spark_version',
+            choices: 'spark30\nspark32\nspark24\nspark23',
+            description: 'Spark version'
+        )
+        choice(
+            name: 'nlp_version',
+            choices: nlp_versions,
+            description: 'Spark NLP version'
+        )
+        choice(
+            name: 'nlp_healthcare_version',
+            choices: nlp_healthcare_versions,
+            description: 'Spark NLP for Healthcare version'
+        )
+    }
+    stages {
+        stage('Setup') {
+            steps {
+                script {
+                    withCredentials([string(credentialsId: DBTOKEN, variable: 'TOKEN')]) {
+                        sh('echo "${TOKEN}" > secret.txt')
+                        sh("databricks configure --token-file secret.txt --host ${DBURL}")
+                    }
+                }
+            }
+        }
+        stage('Copy notebooks to Databricks') {
+            steps {
+                script {
+                    sh("databricks workspace import_dir -o '${NOTEBOOKPATH}' '${WORKSPACEPATH}'")
+                }
+            }
+        }
+        stage('Create Cluster') {
+            steps {
+                script {
+                    withCredentials([string(credentialsId: 'TEST_SPARK_NLP_LICENSE', variable: 'SPARK_OCR_LICENSE'), [
+                        $class: 'AmazonWebServicesCredentialsBinding',
+                        credentialsId: 'a4362e3b-808e-45e0-b7d2-1c62b0572df4',
+                        accessKeyVariable: 'AWS_ACCESS_KEY_ID',
+                        secretKeyVariable: 'AWS_SECRET_ACCESS_KEY']]) {
+                        def jsonCluster = """
+                        {
+                            "num_workers": 1,
+                            "cluster_name": "Spark OCR Notebook Test",
+                            "spark_version": "${DATABRICKS_RUNTIME_VERSION}",
+                            "spark_conf": {
+                                "spark.sql.legacy.allowUntypedScalaUDF": "true"
+                            },
+                            "aws_attributes": {
+                                "first_on_demand": 1,
+                                "availability": "SPOT_WITH_FALLBACK",
+                                "zone_id": "us-west-2a",
+                                "spot_bid_price_percent": 100,
+                                "ebs_volume_count": 0
+                            },
+                            "node_type_id": "i3.xlarge",
+                            "driver_node_type_id": "i3.xlarge",
+                            "spark_env_vars": {
+                                "JSL_OCR_LICENSE": "${SPARK_OCR_LICENSE}",
"${SPARK_OCR_LICENSE}", + "AWS_ACCESS_KEY_ID": "${AWS_ACCESS_KEY_ID}", + "AWS_SECRET_ACCESS_KEY": "${AWS_SECRET_ACCESS_KEY}" + }, + "autotermination_minutes": 20 + } + """ + writeFile file: 'cluster.json', text: jsonCluster + def clusterRespString = sh(returnStdout: true, script: "databricks clusters create --json-file cluster.json") + def clusterRespJson = readJSON text: clusterRespString + cluster_id = clusterRespJson['cluster_id'] + sh "rm cluster.json" + } + } + } + } + stage('Install deps to Cluster') { + steps { + script { + sh("databricks libraries install --cluster-id ${cluster_id} --jar s3://pypi.johnsnowlabs.com/${PYPI_REPO_OCR_SECRET}/jars/spark-ocr-assembly-${SPARK_OCR_VERSION}-${SPARK_VERSION}.jar") + sh("databricks libraries install --cluster-id ${cluster_id} --jar s3://pypi.johnsnowlabs.com/${PYPI_REPO_HEALTHCARE_SECRET}/spark-nlp-jsl-${SPARK_NLP_HEALTHCARE_VERSION}${nlp_version_prefix}.jar") + sh("databricks libraries install --cluster-id ${cluster_id} --maven-coordinates com.johnsnowlabs.nlp:spark-nlp${nlp_version_prefix}_2.12:${SPARK_NLP_VERSION}") + sh("databricks libraries install --cluster-id ${cluster_id} --whl s3://pypi.johnsnowlabs.com/${PYPI_REPO_OCR_SECRET}/spark-ocr/spark_ocr-${SPARK_OCR_VERSION}+${SPARK_VERSION}-py3-none-any.whl") + sh("databricks libraries install --cluster-id ${cluster_id} --whl s3://pypi.johnsnowlabs.com/${PYPI_REPO_HEALTHCARE_SECRET}/spark-nlp-jsl/spark_nlp_jsl-${SPARK_NLP_VERSION}-py3-none-any.whl") + sh("databricks libraries install --cluster-id ${cluster_id} --pypi-package spark-nlp==${SPARK_NLP_VERSION}") + timeout(10) { + waitUntil { + script { + def respStringWaitLib = sh script: "databricks libraries cluster-status --cluster-id ${cluster_id}", returnStdout: true + def respJsonWaitLib = readJSON text: respStringWaitLib + return (respJsonWaitLib['library_statuses'].every{ it['status'] == 'INSTALLED'} ); + } + } + } + } + } + } + stage('Run Notebook Tests') { + steps { + script { + withCredentials([string(credentialsId: DBTOKEN, variable: 'TOKEN')]) { + sh """python3 $SCRIPTPATH/executenotebook.py --workspace=$DBURL\ + --token=$TOKEN\ + --clusterid=${cluster_id}\ + --localpath=${NOTEBOOKPATH}\ + --workspacepath='${WORKSPACEPATH}'\ + --outfilepath='${OUTFILEPATH}'\ + --ignore='${IGNORE}' + """ + sh """sed -i -e 's #ENV# ${OUTFILEPATH} g' ${SCRIPTPATH}/evaluatenotebookruns.py + python3 -m pytest -s --junit-xml=${TESTRESULTPATH}/TEST-notebookout.xml ${SCRIPTPATH}/evaluatenotebookruns.py + """ + + } + } + } + } + } + post { + always { + sh "databricks clusters permanent-delete --cluster-id ${cluster_id}" + sh "find ${OUTFILEPATH} -name '*.json' -exec rm {} +" + junit allowEmptyResults: true, testResults: "**/reports/junit/*.xml" + } + } +}