diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..11c996036 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +vendor +.bundle +.env +stage-item/*.xml +stage-item/*.zip + +# other repositories + +catalog/ +common/ +hathitrust_catalog_indexer/ +ht-pairtree/ +imgsrv-sample-data/ +imgsrv/ +lss_solr_configs/ +pt/ +sample-data/ +slip/ +ssd/ +logs/ +cache/ diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index aa7a80e81..000000000 --- a/Dockerfile +++ /dev/null @@ -1,219 +0,0 @@ -FROM debian:bullseye - -RUN sed -i 's/main.*/main contrib non-free/' /etc/apt/sources.list - -RUN apt-get update && apt-get install -y \ - perl \ - libxerces-c3.2 \ - libxerces-c3-dev \ - sqlite3 \ - file \ - libalgorithm-diff-xs-perl \ - libany-moose-perl \ - libapache-session-perl \ - libarchive-zip-perl \ - libclass-accessor-perl \ - libclass-c3-perl \ - libclass-data-accessor-perl \ - libclass-data-inheritable-perl \ - libclass-errorhandler-perl \ - libclass-load-perl \ - libcommon-sense-perl \ - libcompress-raw-zlib-perl \ - libconfig-auto-perl \ - libconfig-inifiles-perl \ - libconfig-tiny-perl \ - libcrypt-openssl-random-perl \ - libcrypt-openssl-rsa-perl \ - libcrypt-ssleay-perl \ - libdata-optlist-perl \ - libdata-page-perl \ - libdate-calc-perl \ - libdate-manip-perl \ - libdbd-mock-perl \ - libdbd-mysql-perl \ - libdbd-sqlite3-perl \ - libdevel-globaldestruction-perl \ - libdigest-sha-perl \ - libemail-date-format-perl \ - libencode-locale-perl \ - liberror-perl \ - libeval-closure-perl \ - libexcel-writer-xlsx-perl \ - libfcgi-perl \ - libfcgi-procmanager-perl \ - libfile-listing-perl \ - libfile-slurp-perl \ - libfilesys-df-perl \ - libgeo-ip-perl \ - libhtml-parser-perl \ - libhtml-tree-perl \ - libhttp-browserdetect-perl \ - libhttp-cookies-perl \ - libhttp-daemon-perl \ - libhttp-date-perl \ - libhttp-dav-perl \ - libhttp-message-perl \ - libhttp-negotiate-perl \ - libimage-exiftool-perl \ - libimage-info-perl \ - libimage-size-perl \ - 
libinline-perl \ - libio-html-perl \ - libio-socket-ssl-perl \ - libio-string-perl \ - libipc-run-perl \ - libjson-perl \ - libjson-pp-perl \ - libjson-xs-perl \ - liblist-compare-perl \ - liblist-moreutils-perl \ - liblog-log4perl-perl \ - liblwp-authen-oauth2-perl \ - liblwp-mediatypes-perl \ - libmail-sendmail-perl \ - libmailtools-perl \ - libmime-lite-perl \ - libmime-types-perl \ - libmodule-implementation-perl \ - libmodule-runtime-perl \ - libmoose-perl \ - libmouse-perl \ - libmro-compat-perl \ - libnet-dns-perl \ - libnet-http-perl \ - libnet-libidn-perl \ - libnet-oauth-perl \ - libnet-ssleay-perl \ - libpackage-deprecationmanager-perl \ - libpackage-stash-perl \ - libparse-recdescent-perl \ - libplack-perl \ - libpod-simple-perl \ - libproc-processtable-perl \ - libreadonly-perl \ - libreadonly-xs-perl \ - libroman-perl \ - libsoap-lite-perl \ - libspreadsheet-writeexcel-perl \ - libsub-exporter-progressive-perl \ - libsub-name-perl \ - libtemplate-perl \ - libterm-readkey-perl \ - libterm-readline-gnu-perl \ - libtest-requiresinternet-perl \ - libtest-simple-perl \ - libtie-ixhash-perl \ - libtimedate-perl \ - libtry-tiny-perl \ - libuniversal-require-perl \ - liburi-encode-perl \ - libuuid-perl \ - libuuid-tiny-perl \ - libversion-perl \ - libwww-perl \ - libwww-robotrules-perl \ - libxml-dom-perl \ - libxml-libxml-perl \ - libxml-libxslt-perl \ - libxml-sax-perl \ - libxml-simple-perl \ - libxml-writer-perl \ - libyaml-appconfig-perl \ - libyaml-libyaml-perl \ - libyaml-perl \ - libmarc-record-perl \ - libmarc-xml-perl - -RUN apt-get install -y \ - autoconf \ - bison \ - build-essential \ - git \ - libdevel-cover-perl \ - libffi-dev \ - libgdbm-dev \ - libncurses5-dev \ - libreadline6-dev \ - libsqlite3-dev \ - libssl-dev \ - libyaml-dev \ - openssh-server \ - unzip \ - wget \ - zip \ - zlib1g-dev \ - netcat \ - libperl-critic-perl - -RUN apt-get install -y libtest-class-perl libswitch-perl libtest-spec-perl libtest-mockobject-perl - -RUN apt-get 
install -y apache2 apache2-utils vim - -RUN cpan -T \ - File::Pairtree \ - URI::Escape \ - CGI::PSGI \ - IP::Geolocation::MMDB - -WORKDIR htapps/babel/geoip -RUN wget https://github.com/maxmind/MaxMind-DB/blob/main/test-data/GeoIP2-Country-Test.mmdb?raw=true -O GeoIP2-Country.mmdb - -RUN ln -s /tmp /ram - -RUN mkdir -p /l/local/bin -RUN ln -s /usr/bin/unzip /l/local/bin/unzip -RUN ln -s /usr/bin/plackup /l/local/bin/plackup - -WORKDIR /tmp -COPY ./imgsrv/vendor/KDU7A2_Demo_Apps_for_Ubuntu-x86-64_170827.zip /tmp -RUN unzip -j -d /tmp/kakadu KDU7A2_Demo_Apps_for_Ubuntu-x86-64_170827.zip -# RUN wget https://kakadusoftware.com/wp-content/uploads/2014/06/KDU7A2_Demo_Apps_for_Ubuntu-x86-64_170827.zip -# RUN unzip -j -d kakadu KDU7A2_Demo_Apps_for_Ubuntu-x86-64_170827.zip -RUN mv /tmp/kakadu/*.so /usr/local/lib -RUN mv /tmp/kakadu/kdu* /usr/local/bin -RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/kakadu.conf -RUN ldconfig - -RUN mkdir -p /l/local/bin -RUN ln -s /usr/bin/convert /l/local/bin/convert -RUN ln -s /usr/local/bin/kdu_expand /l/local/bin/kdu_expand -RUN ln -s /usr/local/bin/kdu_compress /l/local/bin/kdu_compress -RUN /bin/bash -c 'for cmd in pamflip jpegtopnm tifftopnm bmptopnm pngtopam ppmmake pamcomp pnmscalefixed pamscale pnmrotate pnmpad pamtotiff pnmtotiff pnmtojpeg pamrgbatopng ppmtopgm pnmtopng; do ln -s /usr/bin/$cmd /l/local/bin; done' - -WORKDIR /htapps/babel/cache -RUN mkdir imgsrv -RUN chown -R www-data . -RUN chmod -R 4777 . - -WORKDIR /htapps/babel/logs -RUN chown -R www-data . -RUN chmod -R 4777 . 
- -COPY ./mdp-lib /htapps/babel/mdp-lib -COPY ./plack-lib /htapps/babel/plack-lib -COPY ./slip-lib /htapps/babel/slip-lib -COPY ./mdp-web /htapps/babel/mdp-web - -WORKDIR /htapps/babel/pt -RUN ln -s /htapps/babel /htapps/test.babel - -COPY ./pt /htapps/babel/pt -RUN echo -e "debug_local = 1\ndebug_enabled = 1\nmdpitem_use_cache=false\n" > lib/Config/local.conf -RUN chgrp -R www-data /htapps/babel/pt - -WORKDIR /htapps/babel/imgsrv -COPY ./imgsrv /htapps/babel/imgsrv -RUN echo -e "debug_local=1\ndebug_enabled=1\nmdpitem_use_cache=false\n" > lib/Config/local.conf -RUN chgrp -R www-data /htapps/babel/imgsrv - -RUN ln -s /etc/apache2/mods-available/rewrite.load /etc/apache2/mods-enabled -RUN ln -s /etc/apache2/mods-available/cgi.load /etc/apache2/mods-enabled -RUN ln -s /etc/apache2/mods-available/proxy.load /etc/apache2/mods-enabled -RUN ln -s /etc/apache2/mods-available/proxy_fcgi.load /etc/apache2/mods-enabled -RUN ln -s /etc/apache2/mods-available/proxy_http.load /etc/apache2/mods-enabled - -COPY ./babel-local-dev/docker/000-default.conf /etc/apache2/sites-enabled/000-default.conf - -# CMD [ "/usr/sbin/apache2", "-D", "FOREGROUND"] -CMD [ "/usr/sbin/apache2ctl", "-D", "FOREGROUND" ] diff --git a/README.md b/README.md index 6c3d1b93b..362176a98 100644 --- a/README.md +++ b/README.md @@ -8,34 +8,28 @@ Clone all the repositories in a working directory. We're going to be running docker from this working directory, so `babel-local-dev` has access to the other repositories. -There's a lot, because we're replicating running on the -dev servers with `debug_local=1` enabled. 
- -``` -$ mkdir workdir -$ cd workdir -$ git clone git@github.com:hathitrust/babel-local-dev.git -$ git clone git@github.com:hathitrust/catalog.git -$ git clone git@github.com:hathitrust/common.git -$ git clone git@github.com:hathitrust/imgsrv.git -$ git clone git@github.com:hathitrust/pt.git -$ git clone git@github.com:hathitrust/mdp-lib.git -$ git clone git@github.com:hathitrust/slip-lib.git -$ git clone git@github.com:hathitrust/plack-lib.git -$ git clone git@github.com:hathitrust/imgsrv-sample-data.git -# more to come +First clone this repository: +```bash +git clone git@github.com:hathitrust/babel-local-dev.git babel ``` -## Step 2: intialize all the submodules +Then run: -*Insert fancy one liner if available.* +```bash +cd babel +./setup.sh +``` + +This will check out the other repositories along with their submodules. +There's a lot, because we're replicating running on the dev servers with +`debug_local=1` enabled. ## Step 3: build the `babel-local-dev` environment In your workdir: ``` -docker-compose -f ./babel-local-dev/docker-compose.yml build +docker-compose build ``` ## Step 4: run `babel-local-dev`: @@ -43,29 +37,83 @@ docker-compose -f ./babel-local-dev/docker-compose.yml build In your workdir: ``` -docker-compose -f ./babel-local-dev/docker-compose.yml up +docker-compose up ``` In your browser: -* http://localhost:8080/Search/Home -* http://localhost:8080/cgi/pt?id=test.pd_open +* catalog: `http://localhost:8080/Search/Home` +* catalog solr: `http://localhost:9033` +* full-text solr: `http://localhost:8983` + +PageTurner & imgsrv: +* `http://localhost:8080/cgi/pt?id=test.pd_open` +* `http://localhost:8080/cgi/imgsrv/cover?id=test.pd_open` +* `http://localhost:8080/cgi/imgsrv/image?id=test.pd_open&seq=1` +* `http://localhost:8080/cgi/imgsrv/html?id=test.pd_open&seq=1` +* `http://localhost:8080/cgi/imgsrv/download/pdf?id=test.pd_open&seq=1&attachment=0` + +mysql is exposed at 127.0.0.1:3307. 
The default username & password with write +access is `mdp-admin` / `mdp-admin` (needless to say, do not use this image in +production!) + +```bash +mysql -h 127.0.0.1 -P 3307 -u mdp-admin -p +``` Huzzah! +Not yet configured: +* `http://localhost:8080/cgi/mb` +* `http://localhost:8080/cgi/ls` +* `http://localhost:8080/cgi/whoami` +* `http://localhost:8080/cgi/ping` +* etc + ## How this works (for now) -The `docker-commpose` provides a custom catalog configuration to the `nginx` service to -proxy `babel` CGI requests to the `apache-cgi` service, and serve `common` requests from -the local `common` checkout. +* catalog runs nginx + php +* babel cgi apps run under apache in a single container +* imgsrv plack/psgi process runs in its own container + +## Staging an Item -`apache-cgi` is there because `nginx` can only speak FastCGI/HTTP and running *all* the babel -apps under FastCGI/HTTP is still aspirational. +First, get a HathiTrust ZIP and METS. The easiest way to do this is probably by +using the [Data API client](https://babel.hathitrust.org/cgi/htdc) to download +a public domain item unencumbered by any contractual restrictions, for example +`uc2.ark:/13960/t4mk66f1d`. Select "Download" and in turn select "Item METS +file" and "entire item" and submit the form; this will download the ZIP and +METS respectively. + +Running the stage item script requires a Ruby runtime. It will automate putting +the item in the appropriate location under `sample-data`, fetch the +bibliographic data, and extract and index the full text. 
+ +First make sure all the dependencies are running: + +```bash +docker-compose build +docker-compose up +``` + +Then, install dependencies for the `stage-item` script and run it with the +downloaded zip and METS: + +```bash +cd stage-item +bundle config set --local path 'vendor/bundle' +bundle install +bundle exec ruby stage_item.rb uc2.ark:/13960/t4mk66f1d ark+=13960=t4mk66f1d.zip ark+=13960=t4mk66f1d.mets.xml +``` + +Note that the zip and METS must be named as they are in the actual +repository -- if you name them "foo.zip" or "foo.xml" they will not be renamed, +and full-text indexing and PageTurner will not be able to find the item. ## TODO -- [ ] merge the `imgsrv` DEV-231-grok branch and update the `Dockerfile`s to include `grok` -- [ ] update `slip-lib/Searcher.pm` to set `wt=xml` because the new solr defaults return JSON -- [ ] adding `pt` requires filling out more of the `ht_web` tables (namely `mb_*`) +- [ ] add `mb` and `ls` +- [ ] ensure database user can write to relevant tables +- [ ] link to documentation for important tasks - e.g. running apps under debugging, updating css/js, etc - [ ] easy mechanism to generate placeholder volumes in `imgsrv-sample-data` that correspond to the records in the catalog - +- [ ] make it easier to fetch real volumes diff --git a/cache/.keep b/cache/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/docker-compose.yml b/docker-compose.yml index 82dc2e360..3ceeb1753 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,93 +8,144 @@ services: - "8080:8080" volumes: - ./docker/nginx-default.conf:/etc/nginx/conf.d/default.conf - - ../catalog:/app - - ../common:/app/common-dev + - ./catalog:/app + - ./common:/app/common-dev depends_on: - vufind - imgsrv + - apache-cgi - apache-cgi: - build: - context: ".." 
- dockerfile: "./babel-local-dev/Dockerfile" + imgsrv: + build: ./docker/babel-base + user: ${CURRENT_USER} + image: hathitrust-babel-base volumes: - - ../catalog:/htapps/app - - ../common:/htapps/babel/common - - ../imgsrv:/htapps/babel/imgsrv - - ../pt:/htapps/babel/pt - - ../mdp-web:/htapps/babel/mdp-web - - ../mdp-lib:/htapps/babel/mdp-lib - - ../slip-lib:/htapps/babel/slip-lib - - ../plack-lib:/htapps/babel/plack-lib - - "../imgsrv-sample-data/sdr1:/sdr1" - - "../imgsrv-sample-data/etc:/htapps/babel/etc" - - "../imgsrv-sample-data/watermarks:/htapps/babel/watermarks" - - "../imgsrv-sample-data:/tmp/imgsrv-sample-data" + - ./imgsrv:/htapps/babel/imgsrv + - ./logs:/htapps/babel/logs + - ./cache:/htapps/babel/cache + - "./sample-data/sdr1:/sdr1" + - "./sample-data/etc:/htapps/babel/etc" + - "./sample-data/watermarks:/htapps/babel/watermarks" + - "./sample-data:/tmp/sample-data" environment: - SDRROOT=/htapps/babel - SDRDATAROOT=/sdr1 - - HT_DEV= - - MARIADB_USER=ht_web + - HT_DEV=docker - REMOTE_ADDR=127.0.0.1 - HTTP_HOST=127.0.0.1 + - APACHE_LOG_DIR=/tmp depends_on: - mysql-sdr - solr-sdr-catalog ports: - - "41028:41028" + - "31028:31028" + command: bash -c "/htapps/babel/imgsrv/bin/startup_imgsrv" - # assumes imgsrv-sample-data has been checked out next to "imgsrv" - imgsrv: - build: ../imgsrv + apache-cgi: + build: ./docker/apache-cgi + user: ${CURRENT_USER} volumes: - - ../imgsrv:/htapps/babel/imgsrv - - "../imgsrv-sample-data/sdr1:/sdr1" - - "../imgsrv-sample-data/etc:/htapps/babel/etc" - - "../imgsrv-sample-data/watermarks:/htapps/babel/watermarks" - - "../imgsrv-sample-data:/tmp/imgsrv-sample-data" + - "./docker/apache-cgi/000-default.conf:/etc/apache2/sites-enabled/000-default.conf" + - ./imgsrv:/htapps/babel/imgsrv + - ./logs:/htapps/babel/logs + - ./cache:/htapps/babel/cache + - ./pt:/htapps/babel/pt + - ./ssd:/htapps/babel/ssd + - ./common:/htapps/babel/common + - "./sample-data/sdr1:/sdr1" + - "./sample-data/etc:/htapps/babel/etc" + - 
"./sample-data/watermarks:/htapps/babel/watermarks" + - "./sample-data:/tmp/sample-data" environment: - SDRROOT=/htapps/babel - SDRDATAROOT=/sdr1 - - HT_DEV= - - MARIADB_USER=ht_web - - REMOTE_ADDR=127.0.0.1 - - HTTP_HOST=127.0.0.1 depends_on: - mysql-sdr - solr-sdr-catalog ports: - - "31028:31028" - command: bash -c "/htapps/babel/imgsrv/bin/startup_imgsrv" + - "41028:41028" + + vufind: + build: ./catalog + hostname: 'catalog-dev' + volumes: + - ./catalog:/app + depends_on: + - mysql-sdr + - solr-sdr-catalog + +#### DATA STORES solr-sdr-catalog: image: ghcr.io/hathitrust/catalog-solr-sample ports: - "9033:9033" + solr-lss-dev: + image: solr:6 + ports: + - "8983:8983" + user: ${CURRENT_USER} + volumes: + - ./lss_solr_configs/lss-dev/core-x:/opt/solr/server/solr/core-x + - ./lss_solr_configs/lss-dev/core-y:/opt/solr/server/solr/core-y + - ./lss_solr_configs:/opt/lss_solr_configs + - ./lss_solr_configs/lib:/opt/solr/server/solr/lib + - ./logs/solr:/opt/solr/server/logs + mysql-sdr: - image: mariadb + image: ghcr.io/hathitrust/db-image:latest volumes: - - ../catalog/docker/vufind.sql:/docker-entrypoint-initdb.d/vufind.sql - - ../imgsrv/vendor/common-lib/lib/sql/000_ht_schema.sql:/docker-entrypoint-initdb.d/0000_ht_schema.sql - - ../imgsrv/vendor/common-lib/lib/sql/001_ht_ht_namespaces.sql:/docker-entrypoint-initdb.d/001_ht_ht_namespaces.sql - - ../imgsrv/vendor/common-lib/lib/sql/002_ht_rights_current.sql:/docker-entrypoint-initdb.d/002_ht_rights_current.sql - - ../imgsrv/sql/100_ht_web_schema.sql:/docker-entrypoint-initdb.d/100_ht_web_schema.sql - - ../imgsrv/sql/200_users.sql:/docker-entrypoint-initdb.d/200_users.sql + - ./slip/etc/sql/100_slip.sql:/docker-entrypoint-initdb.d/100_slip.sql + - ./catalog/docker/vufind.sql:/docker-entrypoint-initdb.d/101_vufind.sql + ports: + - "3307:3306" environment: - # - MARIADB_RANDOM_ROOT_PASSWORD=1 + # - mysql-sdr_RANDOM_ROOT_PASSWORD=1 MYSQL_ROOT_PASSWORD: TIMTOWTDIBSCINABTE - vufind: - build: ../catalog - hostname: 
'catalog-dev' +#### INDEXING + + # We add the 'indexing' profile to keep these from starting automatically + # when we do 'docker-compose up' + + slip: + build: ./slip + image: hathitrust-slip + user: ${CURRENT_USER} volumes: - - ../catalog:/app + - ./slip:/htapps/babel/slip + - ./logs:/htapps/babel/logs + # this is where docs-j saves output + - ./slip/sample:/htapps/babel/logs/tmp + - "./sample-data/sdr1:/sdr1" + - "./sample-data/etc:/htapps/babel/etc" + - "./sample-data:/tmp/sample-data" + environment: + - SDRROOT=/htapps/babel + - SDRDATAROOT=/sdr1 + - HT_DEV=docker depends_on: - mysql-sdr - solr-sdr-catalog + command: bash + profiles: + - indexing + + traject: + build: ./hathitrust_catalog_indexer + user: ${CURRENT_USER} + environment: + - SOLR_URL=http://solr-sdr-catalog:9033/solr/catalog + - redirect_file=/dev/null + - NO_DB=1 + - DDIR=/app/metadata + depends_on: + - solr-sdr-catalog + volumes: + - "./sample-data/metadata:/app/metadata" + profiles: + - indexing -volumes: - # sdr1: - data_db: + # todo: ingest, bound to sample-data diff --git a/docker/000-default.conf b/docker/000-default.conf deleted file mode 100644 index a23ff5ec2..000000000 --- a/docker/000-default.conf +++ /dev/null @@ -1,75 +0,0 @@ -Listen 41028 - - # The ServerName directive sets the request scheme, hostname and port that - # the server uses to identify itself. This is used when creating - # redirection URLs. In the context of virtual hosts, the ServerName - # specifies what hostname must appear in the request's Host: header to - # match this virtual host. For the default virtual host (this file) this - # value is not decisive as it is used as a last resort host regardless. - # However, you must set it for any further virtual host explicitly. - #ServerName www.example.com - - ServerAdmin webmaster@localhost - DocumentRoot /htapps/babel - - # Available loglevels: trace8, ..., trace1, debug, info, notice, warn, - # error, crit, alert, emerg. 
- # It is also possible to configure the loglevel for particular - # modules, e.g. - #LogLevel info ssl:warn - - # ErrorLog ${APACHE_LOG_DIR}/error.log - # CustomLog ${APACHE_LOG_DIR}/access.log combined - - LogLevel trace8 - ErrorLog /dev/stderr - CustomLog /dev/stdout combined - - # For most configuration files from conf-available/, which are - # enabled or disabled at a global level, it is possible to - # include a line for only one particular virtual host. For example the - # following line enables the CGI configuration for this host only - # after it has been globally disabled with "a2disconf". - - Include conf-available/serve-cgi-bin.conf - RewriteEngine On - - ## SetEnv/SetEnvIf for environment variables - SetEnv SDRROOT /htapps/babel - SetEnv SDRDATAROOT /sdr1 - SetEnv ASSERTION_EMAIL hathitrust-system@umich.edu - SetEnv HT_DEV www-data - # SetEnv HT_IGNORE_GEOIP true - - # SetEnv PTSEARCH_SOLR https://testing.ptsearch.kubernetes.hathitrust.org:8443/solr/ptsearch - # SetEnv PTSEARCH_SOLR_BASIC_AUTH c29scjpwY1hoMVQxTVF4eExoRUNjSVZPME43MDc2Vk1WdzdUYms= - - - - Require not env badrobot - Require not env loadbalancer - Require all granted - - - - - Options +ExecCGI - SetHandler cgi-script - - - RewriteCond %{DOCUMENT_ROOT}/babel/$1/web/$2 -f - RewriteRule ^/([^/]+)/(.*) /$1/web/$2 [last] - - RewriteCond %{DOCUMENT_ROOT}/babel/$1/web/ -d - RewriteRule ^/([^/]+)/?$ /$1/web/ [last] - - RewriteCond %{DOCUMENT_ROOT}/babel/$2/cgi/$3 -f - RewriteRule ^/(shcgi|cgi)/([^/]+)/([^/]+)(.*)$ /$2/cgi/$3$4 [skip] - - RewriteCond %{DOCUMENT_ROOT}/babel/$2/cgi/$2 -f - RewriteRule ^/(shcgi|cgi)/([^/]+)(.*)$ /$2/cgi/$2$3 - - RewriteCond %{DOCUMENT_ROOT}/babel/$1/cgi/$3.choke -f - RewriteRule ^/([^/]+)/(shcgi|cgi)/([^/]+)(.*)$ /$1/cgi/$3.choke$4 [last] - - \ No newline at end of file diff --git a/docker/apache-cgi/000-default.conf b/docker/apache-cgi/000-default.conf new file mode 100644 index 000000000..318f32a94 --- /dev/null +++ b/docker/apache-cgi/000-default.conf @@ -0,0 +1,54 
@@ +ServerName apache-cgi +ErrorLog /dev/stdout +CustomLog /dev/stdout combined +Listen 41028 + + + ServerAdmin hathitrust@localhost + DocumentRoot /htapps/babel + + LogLevel debug + ErrorLog /dev/stdout + CustomLog /dev/stdout combined + + RewriteEngine On + + ## SetEnv/SetEnvIf for environment variables + SetEnv SDRROOT /htapps/babel + SetEnv SDRDATAROOT /sdr1 + SetEnv ASSERTION_EMAIL hathitrust-system@umich.edu + SetEnv HT_DEV www-data + # SetEnv HT_IGNORE_GEOIP true + + # SetEnv PTSEARCH_SOLR https://testing.ptsearch.kubernetes.hathitrust.org:8443/solr/ptsearch + # SetEnv PTSEARCH_SOLR_BASIC_AUTH c29scjpwY1hoMVQxTVF4eExoRUNjSVZPME43MDc2Vk1WdzdUYms= + + + Options Indexes FollowSymLinks + AllowOverride All + Require all granted + + + + Options +ExecCGI + SetHandler cgi-script + + + RewriteCond %{DOCUMENT_ROOT}/$1/web/$2 -f + RewriteRule ^/([^/]+)/(.*) /$1/web/$2 [last] + + RewriteCond %{DOCUMENT_ROOT}/$1/web/ -d + RewriteRule ^/([^/]+)/?$ /$1/web/ [last] + + RewriteCond %{DOCUMENT_ROOT}/$2/cgi/$3 -f + RewriteRule ^/(cgi)/([^/]+)/([^/]+)(.*)$ /$2/cgi/$3$4 [skip] + + RewriteCond %{DOCUMENT_ROOT}/$2/cgi/$2 -f + RewriteRule ^/(cgi)/([^/]+)(.*)$ /$2/cgi/$2$3 + + RewriteCond %{DOCUMENT_ROOT}/$1/cgi/$3.choke -f + RewriteRule ^/([^/]+)/(cgi)/([^/]+)(.*)$ /$1/cgi/$3.choke$4 [last] + + # TODO include for auth env vars + + diff --git a/docker/apache-cgi/Dockerfile b/docker/apache-cgi/Dockerfile new file mode 100644 index 000000000..d9d5529c6 --- /dev/null +++ b/docker/apache-cgi/Dockerfile @@ -0,0 +1,21 @@ +FROM hathitrust-babel-base + +RUN apt-get -y install apache2 libapache2-mod-fcgid + +RUN a2dissite '*' +RUN a2disconf other-vhosts-access-log +RUN a2dismod 'mpm_*' +RUN a2enmod headers \ + mpm_prefork \ + rewrite \ + proxy \ + proxy_fcgi \ + proxy_http \ + cgi + +COPY 000-default.conf /etc/apache2/sites-enabled +STOPSIGNAL SIGWINCH + +COPY apache.sh / +RUN chmod +x /apache.sh +ENTRYPOINT ["/apache.sh"] diff --git a/docker/apache-cgi/apache.sh 
#! /bin/bash
#
# Container entrypoint for the apache-cgi service: prepares Apache's runtime
# directories under /tmp, clears any stale PID file, exports the APACHE_*
# variables, and then replaces this shell with Apache in the foreground.

# All runtime state lives under one directory in /tmp.
APACHE_STATE_DIR=/tmp/apache2

# mkdir -p is idempotent, so create the directories unconditionally. Guarding
# on the parent directory alone would skip creating run/lock/log when the
# parent exists but one of them does not.
mkdir -p "$APACHE_STATE_DIR"/{run,lock,log}

export APACHE_PID_FILE="$APACHE_STATE_DIR/run/apache2.pid"
export APACHE_RUN_DIR="$APACHE_STATE_DIR/run"
export APACHE_LOCK_DIR="$APACHE_STATE_DIR/lock"
export APACHE_LOG_DIR="$APACHE_STATE_DIR/log"

# Apache gets grumpy about PID files pre-existing. Remove the PID file at the
# path exported above; the old top-level glob is kept so previously covered
# locations are still cleaned up.
rm -f "$APACHE_PID_FILE" "$APACHE_STATE_DIR"/apache2*.pid

# Won't be effective if we pass user from docker-compose; that's OK - hence
# the /tmp-based directories above
export APACHE_RUN_USER=www-data
export APACHE_RUN_GROUP=www-data

exec apache2 -DFOREGROUND
https://github.com/maxmind/MaxMind-DB/blob/main/test-data/GeoIP2-Country-Test.mmdb?raw=true GeoIP2-Country.mmdb + +RUN ln -s /tmp /ram + +RUN mkdir -p /l/local/bin +RUN ln -s /usr/bin/unzip /l/local/bin/unzip +RUN ln -s /usr/bin/convert /l/local/bin/convert +RUN ln -s /usr/bin/plackup /l/local/bin/plackup +RUN /bin/bash -c 'for cmd in pamflip jpegtopnm tifftopnm bmptopnm pngtopam ppmmake pamcomp pnmscalefixed pamscale pnmrotate pnmpad pamtotiff pnmtotiff pnmtojpeg pamrgbatopng ppmtopgm pnmtopng; do ln -s /usr/bin/$cmd /l/local/bin; done' + +WORKDIR /htapps/babel/imgsrv + +RUN mkdir /htapps/babel/cache +RUN chmod 4777 /htapps/babel/cache + +RUN mkdir /htapps/babel/logs +RUN chmod 4777 /htapps/babel/logs + +RUN ln -s /htapps/babel /htapps/test.babel +RUN cd /htapps/babel + +COPY . /htapps/babel/imgsrv +RUN ln -s imgsrv/vendor/common-lib/lib ../mdp-lib +RUN ln -s imgsrv/web/common-web ../mdp-web + +CMD ["/htapps/babel/imgsrv/bin/startup_imgsrv"] diff --git a/docker/nginx-default.conf b/docker/nginx-default.conf index 417a954fa..1b6b71aaa 100644 --- a/docker/nginx-default.conf +++ b/docker/nginx-default.conf @@ -76,9 +76,13 @@ server { } location ~ /cgi/imgsrv/(image|thumbnail|meta|html|ocr|cover)(.*) { - rewrite /cgi/imgsrv/(.*)$ /$1 break; - proxy_pass http://imgsrv:31028; - # proxy_pass http://imgsrv:31028/$1$2; + fastcgi_split_path_info ^(/cgi/imgsrv)(/.+)$; + # try_files $fastcgi_script_name =404; + set $path_info $fastcgi_path_info; + fastcgi_param PATH_INFO $path_info; + fastcgi_pass imgsrv:31028; + fastcgi_param SCRIPT_FILENAME $fastcgi_script_name; + include fastcgi_params; } location /cgi/imgsrv { @@ -96,6 +100,16 @@ server { proxy_pass http://apache-cgi:41028/pt; } + location /cgi/ssd { + # rewrite /cgi/pt/(.*)$ /cgi/pt/$1 break; + proxy_pass http://apache-cgi:41028/cgi/ssd; + } + + location /ssd { + rewrite /ssd/(.*)$ /ssd/$1 break; + proxy_pass http://apache-cgi:41028/ssd; + } + location / { try_files $uri @rewrite; } diff --git a/logs/.keep b/logs/.keep 
new file mode 100644 index 000000000..e69de29bb diff --git a/logs/solr/.keep b/logs/solr/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/setup.sh b/setup.sh new file mode 100755 index 000000000..c398a5b03 --- /dev/null +++ b/setup.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +cat <> .env +echo "APACHE_RUN_USER=$(id -u)" >> .env +echo "APACHE_RUN_GROUP=$(id -g)" >> .env + +# Do we need these separately? +# git clone $GIT_BASE/mdp-lib.git +# git clone $GIT_BASE/slip-lib.git +# git clone $GIT_BASE/plack-lib.git diff --git a/stage-item/.env b/stage-item/.env new file mode 120000 index 000000000..4a82335f5 --- /dev/null +++ b/stage-item/.env @@ -0,0 +1 @@ +../.env \ No newline at end of file diff --git a/stage-item/Gemfile b/stage-item/Gemfile new file mode 100644 index 000000000..04cc7d17a --- /dev/null +++ b/stage-item/Gemfile @@ -0,0 +1,19 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +git_source(:github) { |repo_name| "https://github.com/#{repo_name}" } + +# gem "rails" + +gem "marc", "~> 1.2" +gem "faraday", "~> 2.7" +gem "faraday-follow_redirects" +gem "ht-pairtree", github: "hathitrust/ht-pairtree", tag: 'v0.1.0' +gem "mysql2" +gem "sequel" + +group :development do + gem "pry" + gem "pry-byebug" +end diff --git a/stage-item/Gemfile.lock b/stage-item/Gemfile.lock new file mode 100644 index 000000000..68f602c2b --- /dev/null +++ b/stage-item/Gemfile.lock @@ -0,0 +1,55 @@ +GIT + remote: https://github.com/hathitrust/ht-pairtree + revision: a8620c464438c8de7ad9a492b1fd5bd7f0756079 + tag: v0.1.0 + specs: + ht-pairtree (0.1.0) + pairtree (~> 0.3) + +GEM + remote: https://rubygems.org/ + specs: + byebug (11.1.3) + coderay (1.1.3) + faraday (2.7.4) + faraday-net_http (>= 2.0, < 3.1) + ruby2_keywords (>= 0.0.4) + faraday-follow_redirects (0.3.0) + faraday (>= 1, < 3) + faraday-net_http (3.0.2) + marc (1.2.0) + rexml + scrub_rb (>= 1.0.1, < 2) + unf + method_source (1.0.0) + mysql2 (0.5.5) + pairtree (0.3.0) + pry (0.14.2) + coderay (~> 1.1) 
# Stages a single HathiTrust item into the local development environment.
#
# Given an HTID plus the item's zip and METS files, it:
# * copies the zip and METS into the sample repository pairtree,
# * fetches the catalog record and indexes it into the catalog Solr,
# * populates the rights_current and slip_rights tables,
# * indexes the item's full text.
#
# Relies on constants defined at file level (HTDEV_ROOT, METADATA_ROOT,
# SDRDATAROOT, CATALOG_BASE, MYSQL_URL).
class StageItem
  # File name suffix of the Solr documents picked up by #index_full_text.
  SOLR_DOC_SUFFIX = ".solr.xml"

  # Upserts the item's rights row, resolving attribute/reason/source names to
  # their numeric ids.
  RIGHTS_CURRENT_SQL = <<~SQL.freeze
    REPLACE INTO rights_current (namespace, id, attr, reason, source, access_profile, user, note) VALUES
    (?, ?,
      (SELECT id FROM attributes WHERE name = ?),
      (SELECT id FROM reasons WHERE name = ?),
      (SELECT id FROM sources WHERE name = ?),
      ?,'stage-item','staged from catalog record by stage_item.rb')
  SQL

  # Copies the rights row just written into slip_rights, adding the catalog
  # id (sysid) and the record's update time.
  SLIP_RIGHTS_SQL = <<~SQL.freeze
    REPLACE INTO slip_rights (nid, attr, reason, source, user, time, sysid, update_time)
    SELECT concat(namespace, '.', id), attr, reason, source, user, time, ?, ? FROM rights_current WHERE namespace = ? and id = ?
  SQL

  attr_reader :htid, :namespace, :objid, :zip, :mets, :pt

  # Command-line entry point. Expects exactly three arguments in ARGV
  # (htid, zip path, METS path); prints usage and exits otherwise.
  def self.main
    usage unless ARGV.length == 3

    new(*ARGV).run
  end

  # Prints usage to standard error and exits with status 1.
  def self.usage
    # $PROGRAM_NAME must be interpolated explicitly; a bare "$0" inside a
    # heredoc is printed literally.
    $stderr.puts <<~EOT
      Usage: #{$PROGRAM_NAME} htid some_item.zip some_item.mets.xml

      where htid is something like "namespace.objid".

      Stages an item into the sample repository from a given zip and XML file. It:
        * fetches metadata from the catalog
        * indexes this into the sample catalog
        * populates the rights_current and slip_rights table
        * indexes the full text
    EOT

    exit 1
  end

  # @param htid [String] item identifier of the form "namespace.objid"
  # @param zip [String] path to the item's zip file (must end in .zip)
  # @param mets [String] path to the item's METS file (must end in .xml)
  #
  # Prints usage and exits with status 1 when the arguments are not usable.
  def initialize(htid, zip, mets)
    @htid = htid
    @namespace, @objid = htid.split(".", 2)
    @zip = zip
    @mets = mets

    # Validate before touching the pairtree so bad input fails fast.
    self.class.usage unless valid_arguments?

    @pt = HathiTrust::Pairtree.new(root: File.join(SDRDATAROOT, "obj"))
  end

  # Runs every staging step in order: content, then metadata, then full text.
  def run
    stage_content
    stage_metadata
    index_full_text
  end

  # Fetches the catalog record into a temporary JSON file under
  # METADATA_ROOT, loads rights into the database and indexes the record.
  # The temporary file is removed when the block finishes.
  def stage_metadata
    Tempfile.create(["metadata", ".json"], File.realpath(METADATA_ROOT)) do |f|
      metadata = fetch_metadata(f)
      populate_database(metadata)
      index_metadata(File.basename(f.path))
    end
  end

  # Creates the item's pairtree directory and copies the zip and METS there.
  def stage_content
    pt.create(htid, new_namespace_allowed: true)
    repo_path = pt.path_for(htid)

    puts("↪️ Copying zip and mets to repo #{repo_path}\n")
    FileUtils.cp([zip, mets], repo_path)
  end

  # Downloads the catalog record for the item as MARC-in-JSON.
  #
  # @param tempfile [File] open file the raw JSON is written to
  # @return [MARC::Record] the parsed record
  # @raise [RuntimeError] when the catalog does not answer with a 2xx status
  def fetch_metadata(tempfile)
    url = "/Record/HTID/#{htid}.json"
    puts "📙 Getting metadata #{CATALOG_BASE}#{url} and saving to tempfile #{tempfile.path}\n"

    conn = Faraday.new(CATALOG_BASE) do |f|
      f.response :follow_redirects
    end

    response = conn.get(url)
    # Without this check an error page would be saved and handed to the JSON
    # parser, hiding the real cause.
    raise "Failed to fetch #{CATALOG_BASE}#{url}: HTTP #{response.status}" unless response.success?

    json = response.body
    tempfile.write(json)
    tempfile.flush

    MARC::Record.new_from_hash(JSON.parse(json))
  end

  # Writes the item's rights into rights_current and slip_rights, taking the
  # values from the record's 974 field whose $u matches the HTID.
  #
  # @param record [MARC::Record] catalog record containing the item
  # @raise [RuntimeError] when the record has no 001 or no matching 974 field
  def populate_database(record)
    catalog_id = record["001"]&.value
    raise "Catalog record for #{htid} has no 001 field" unless catalog_id

    # each item has a 974 field; the HTID is in 974$u
    item_data = record.fields("974").find { |f| f["u"] == htid }
    raise "Can't find item data for #{htid} in record #{catalog_id}" unless item_data

    rights_attr = item_data["r"]
    rights_reason = item_data["q"]
    rights_source = item_data["s"]
    zephir_update_date = item_data["d"]
    access_profile = access_profile_for(rights_source)

    # The block form closes the connection when the block exits, even on error.
    Sequel.connect(MYSQL_URL) do |dbh|
      dbh[RIGHTS_CURRENT_SQL, namespace, objid, rights_attr, rights_reason, rights_source, access_profile].insert
      dbh[SLIP_RIGHTS_SQL, catalog_id, zephir_update_date, namespace, objid].insert
    end
  end

  # Indexes the staged metadata file into the catalog Solr and commits.
  #
  # @param file [String] base name of the JSON file inside the metadata directory
  def index_metadata(file)
    puts "📕 Indexing metadata..."

    catalog_utils_sh = File.join(HTDEV_ROOT, "hathitrust_catalog_indexer", "bin", "utils.sh")
    run_command("docker-compose", "run", "traject", "bin/index_file", "metadata/#{file}")
    # The script path is passed as $1 rather than spliced into the bash
    # command text, so spaces or metacharacters in the path are harmless.
    run_command("bash", "-c", 'source "$1"; solr_url; commit', "bash", catalog_utils_sh)
  end

  # Generates full-text Solr documents for the item and loads them into Solr.
  def index_full_text
    puts "📖 Indexing full text..."

    run_command("docker-compose", "run", "slip", "index/docs-j", "-r11", "-I#{htid}")

    slip_sample_dir = File.join(HTDEV_ROOT, "slip", "sample")
    load_into_solr_sh = File.join(slip_sample_dir, "load_into_solr.sh")
    pt_objid = File.basename(mets, ".mets.xml")

    solr_docs = solr_documents(slip_sample_dir, pt_objid)
    if solr_docs.empty?
      $stderr.puts "No #{pt_objid}*#{SOLR_DOC_SUFFIX} files found in #{slip_sample_dir}; nothing loaded into Solr"
      return
    end

    run_command("bash", load_into_solr_sh, *solr_docs)
  end

  private

  # True when the HTID has both a namespace and an object id and both files
  # exist with the expected extensions.
  def valid_arguments?
    return false if namespace.to_s.empty? || objid.to_s.empty?
    return false unless zip.end_with?(".zip") && mets.end_with?(".xml")

    [zip, mets].all? { |path| File.file?(path) }
  end

  # Simplification for the purposes of testing data: for now, the access
  # profile is 2 ('google') if the item was digitized by google and 1
  # ('open') otherwise.
  def access_profile_for(rights_source)
    rights_source == "google" ? 2 : 1
  end

  # Full paths, sorted, of the files in +dir+ named "<prefix>*.solr.xml".
  # Matching is done in Ruby so the prefix is never interpreted as a glob or
  # shell pattern. Returns an empty array when +dir+ does not exist.
  def solr_documents(dir, prefix)
    return [] unless Dir.exist?(dir)

    min_length = prefix.length + SOLR_DOC_SUFFIX.length
    Dir.children(dir)
       .select { |name| name.length >= min_length && name.start_with?(prefix) && name.end_with?(SOLR_DOC_SUFFIX) }
       .sort
       .map { |name| File.join(dir, name) }
  end

  # Runs an external command without going through a shell (always call with
  # the program and at least one argument). A failure is reported on standard
  # error but does not abort staging, so later steps still run.
  #
  # @return [Boolean] true when the command exited with status 0
  def run_command(*argv)
    succeeded = system(*argv) == true
    $stderr.puts "Command failed: #{argv.join(' ')}" unless succeeded
    succeeded
  end
end