diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..11c996036
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,21 @@
+vendor
+.bundle
+.env
+stage-item/*.xml
+stage-item/*.zip
+
+# other repositories
+
+catalog/
+common/
+hathitrust_catalog_indexer/
+ht-pairtree/
+imgsrv-sample-data/
+imgsrv/
+lss_solr_configs/
+pt/
+sample-data/
+slip/
+ssd/
+logs/
+cache/
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index aa7a80e81..000000000
--- a/Dockerfile
+++ /dev/null
@@ -1,219 +0,0 @@
-FROM debian:bullseye
-
-RUN sed -i 's/main.*/main contrib non-free/' /etc/apt/sources.list
-
-RUN apt-get update && apt-get install -y \
- perl \
- libxerces-c3.2 \
- libxerces-c3-dev \
- sqlite3 \
- file \
- libalgorithm-diff-xs-perl \
- libany-moose-perl \
- libapache-session-perl \
- libarchive-zip-perl \
- libclass-accessor-perl \
- libclass-c3-perl \
- libclass-data-accessor-perl \
- libclass-data-inheritable-perl \
- libclass-errorhandler-perl \
- libclass-load-perl \
- libcommon-sense-perl \
- libcompress-raw-zlib-perl \
- libconfig-auto-perl \
- libconfig-inifiles-perl \
- libconfig-tiny-perl \
- libcrypt-openssl-random-perl \
- libcrypt-openssl-rsa-perl \
- libcrypt-ssleay-perl \
- libdata-optlist-perl \
- libdata-page-perl \
- libdate-calc-perl \
- libdate-manip-perl \
- libdbd-mock-perl \
- libdbd-mysql-perl \
- libdbd-sqlite3-perl \
- libdevel-globaldestruction-perl \
- libdigest-sha-perl \
- libemail-date-format-perl \
- libencode-locale-perl \
- liberror-perl \
- libeval-closure-perl \
- libexcel-writer-xlsx-perl \
- libfcgi-perl \
- libfcgi-procmanager-perl \
- libfile-listing-perl \
- libfile-slurp-perl \
- libfilesys-df-perl \
- libgeo-ip-perl \
- libhtml-parser-perl \
- libhtml-tree-perl \
- libhttp-browserdetect-perl \
- libhttp-cookies-perl \
- libhttp-daemon-perl \
- libhttp-date-perl \
- libhttp-dav-perl \
- libhttp-message-perl \
- libhttp-negotiate-perl \
- libimage-exiftool-perl \
- libimage-info-perl \
- libimage-size-perl \
- libinline-perl \
- libio-html-perl \
- libio-socket-ssl-perl \
- libio-string-perl \
- libipc-run-perl \
- libjson-perl \
- libjson-pp-perl \
- libjson-xs-perl \
- liblist-compare-perl \
- liblist-moreutils-perl \
- liblog-log4perl-perl \
- liblwp-authen-oauth2-perl \
- liblwp-mediatypes-perl \
- libmail-sendmail-perl \
- libmailtools-perl \
- libmime-lite-perl \
- libmime-types-perl \
- libmodule-implementation-perl \
- libmodule-runtime-perl \
- libmoose-perl \
- libmouse-perl \
- libmro-compat-perl \
- libnet-dns-perl \
- libnet-http-perl \
- libnet-libidn-perl \
- libnet-oauth-perl \
- libnet-ssleay-perl \
- libpackage-deprecationmanager-perl \
- libpackage-stash-perl \
- libparse-recdescent-perl \
- libplack-perl \
- libpod-simple-perl \
- libproc-processtable-perl \
- libreadonly-perl \
- libreadonly-xs-perl \
- libroman-perl \
- libsoap-lite-perl \
- libspreadsheet-writeexcel-perl \
- libsub-exporter-progressive-perl \
- libsub-name-perl \
- libtemplate-perl \
- libterm-readkey-perl \
- libterm-readline-gnu-perl \
- libtest-requiresinternet-perl \
- libtest-simple-perl \
- libtie-ixhash-perl \
- libtimedate-perl \
- libtry-tiny-perl \
- libuniversal-require-perl \
- liburi-encode-perl \
- libuuid-perl \
- libuuid-tiny-perl \
- libversion-perl \
- libwww-perl \
- libwww-robotrules-perl \
- libxml-dom-perl \
- libxml-libxml-perl \
- libxml-libxslt-perl \
- libxml-sax-perl \
- libxml-simple-perl \
- libxml-writer-perl \
- libyaml-appconfig-perl \
- libyaml-libyaml-perl \
- libyaml-perl \
- libmarc-record-perl \
- libmarc-xml-perl
-
-RUN apt-get install -y \
- autoconf \
- bison \
- build-essential \
- git \
- libdevel-cover-perl \
- libffi-dev \
- libgdbm-dev \
- libncurses5-dev \
- libreadline6-dev \
- libsqlite3-dev \
- libssl-dev \
- libyaml-dev \
- openssh-server \
- unzip \
- wget \
- zip \
- zlib1g-dev \
- netcat \
- libperl-critic-perl
-
-RUN apt-get install -y libtest-class-perl libswitch-perl libtest-spec-perl libtest-mockobject-perl
-
-RUN apt-get install -y apache2 apache2-utils vim
-
-RUN cpan -T \
- File::Pairtree \
- URI::Escape \
- CGI::PSGI \
- IP::Geolocation::MMDB
-
-WORKDIR htapps/babel/geoip
-RUN wget https://github.com/maxmind/MaxMind-DB/blob/main/test-data/GeoIP2-Country-Test.mmdb?raw=true -O GeoIP2-Country.mmdb
-
-RUN ln -s /tmp /ram
-
-RUN mkdir -p /l/local/bin
-RUN ln -s /usr/bin/unzip /l/local/bin/unzip
-RUN ln -s /usr/bin/plackup /l/local/bin/plackup
-
-WORKDIR /tmp
-COPY ./imgsrv/vendor/KDU7A2_Demo_Apps_for_Ubuntu-x86-64_170827.zip /tmp
-RUN unzip -j -d /tmp/kakadu KDU7A2_Demo_Apps_for_Ubuntu-x86-64_170827.zip
-# RUN wget https://kakadusoftware.com/wp-content/uploads/2014/06/KDU7A2_Demo_Apps_for_Ubuntu-x86-64_170827.zip
-# RUN unzip -j -d kakadu KDU7A2_Demo_Apps_for_Ubuntu-x86-64_170827.zip
-RUN mv /tmp/kakadu/*.so /usr/local/lib
-RUN mv /tmp/kakadu/kdu* /usr/local/bin
-RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/kakadu.conf
-RUN ldconfig
-
-RUN mkdir -p /l/local/bin
-RUN ln -s /usr/bin/convert /l/local/bin/convert
-RUN ln -s /usr/local/bin/kdu_expand /l/local/bin/kdu_expand
-RUN ln -s /usr/local/bin/kdu_compress /l/local/bin/kdu_compress
-RUN /bin/bash -c 'for cmd in pamflip jpegtopnm tifftopnm bmptopnm pngtopam ppmmake pamcomp pnmscalefixed pamscale pnmrotate pnmpad pamtotiff pnmtotiff pnmtojpeg pamrgbatopng ppmtopgm pnmtopng; do ln -s /usr/bin/$cmd /l/local/bin; done'
-
-WORKDIR /htapps/babel/cache
-RUN mkdir imgsrv
-RUN chown -R www-data .
-RUN chmod -R 4777 .
-
-WORKDIR /htapps/babel/logs
-RUN chown -R www-data .
-RUN chmod -R 4777 .
-
-COPY ./mdp-lib /htapps/babel/mdp-lib
-COPY ./plack-lib /htapps/babel/plack-lib
-COPY ./slip-lib /htapps/babel/slip-lib
-COPY ./mdp-web /htapps/babel/mdp-web
-
-WORKDIR /htapps/babel/pt
-RUN ln -s /htapps/babel /htapps/test.babel
-
-COPY ./pt /htapps/babel/pt
-RUN echo -e "debug_local = 1\ndebug_enabled = 1\nmdpitem_use_cache=false\n" > lib/Config/local.conf
-RUN chgrp -R www-data /htapps/babel/pt
-
-WORKDIR /htapps/babel/imgsrv
-COPY ./imgsrv /htapps/babel/imgsrv
-RUN echo -e "debug_local=1\ndebug_enabled=1\nmdpitem_use_cache=false\n" > lib/Config/local.conf
-RUN chgrp -R www-data /htapps/babel/imgsrv
-
-RUN ln -s /etc/apache2/mods-available/rewrite.load /etc/apache2/mods-enabled
-RUN ln -s /etc/apache2/mods-available/cgi.load /etc/apache2/mods-enabled
-RUN ln -s /etc/apache2/mods-available/proxy.load /etc/apache2/mods-enabled
-RUN ln -s /etc/apache2/mods-available/proxy_fcgi.load /etc/apache2/mods-enabled
-RUN ln -s /etc/apache2/mods-available/proxy_http.load /etc/apache2/mods-enabled
-
-COPY ./babel-local-dev/docker/000-default.conf /etc/apache2/sites-enabled/000-default.conf
-
-# CMD [ "/usr/sbin/apache2", "-D", "FOREGROUND"]
-CMD [ "/usr/sbin/apache2ctl", "-D", "FOREGROUND" ]
diff --git a/README.md b/README.md
index 6c3d1b93b..362176a98 100644
--- a/README.md
+++ b/README.md
@@ -8,34 +8,28 @@ Clone all the repositories in a working directory.
We're going to be running docker from this working directory,
so `babel-local-dev` has access to the other repositories.
-There's a lot, because we're replicating running on the
-dev servers with `debug_local=1` enabled.
-
-```
-$ mkdir workdir
-$ cd workdir
-$ git clone git@github.com:hathitrust/babel-local-dev.git
-$ git clone git@github.com:hathitrust/catalog.git
-$ git clone git@github.com:hathitrust/common.git
-$ git clone git@github.com:hathitrust/imgsrv.git
-$ git clone git@github.com:hathitrust/pt.git
-$ git clone git@github.com:hathitrust/mdp-lib.git
-$ git clone git@github.com:hathitrust/slip-lib.git
-$ git clone git@github.com:hathitrust/plack-lib.git
-$ git clone git@github.com:hathitrust/imgsrv-sample-data.git
-# more to come
+First clone this repository:
+```bash
+git clone git@github.com:hathitrust/babel-local-dev.git babel
```
-## Step 2: intialize all the submodules
+Then run:
-*Insert fancy one liner if available.*
+```bash
+cd babel
+./setup.sh
+```
+
+This will check out the other repositories along with their submodules.
+There's a lot, because we're replicating running on the dev servers with
+`debug_local=1` enabled.
## Step 3: build the `babel-local-dev` environment
In your workdir:
```
-docker-compose -f ./babel-local-dev/docker-compose.yml build
+docker-compose build
```
## Step 4: run `babel-local-dev`:
@@ -43,29 +37,83 @@ docker-compose -f ./babel-local-dev/docker-compose.yml build
In your workdir:
```
-docker-compose -f ./babel-local-dev/docker-compose.yml up
+docker-compose up
```
In your browser:
-* http://localhost:8080/Search/Home
-* http://localhost:8080/cgi/pt?id=test.pd_open
+* catalog: `http://localhost:8080/Search/Home`
+* catalog solr: `http://localhost:9033`
+* full-text solr: `http://localhost:8983`
+
+PageTurner & imgsrv:
+* `http://localhost:8080/cgi/pt?id=test.pd_open`
+* `http://localhost:8080/cgi/imgsrv/cover?id=test.pd_open`
+* `http://localhost:8080/cgi/imgsrv/image?id=test.pd_open&seq=1`
+* `http://localhost:8080/cgi/imgsrv/html?id=test.pd_open&seq=1`
+* `http://localhost:8080/cgi/imgsrv/download/pdf?id=test.pd_open&seq=1&attachment=0`
+
+mysql is exposed at 127.0.0.1:3307. The default username & password with write
+access is `mdp-admin` / `mdp-admin` (needless to say, do not use this image in
+production!)
+
+```bash
+mysql -h 127.0.0.1 -P 3307 -u mdp-admin -p
+```
Huzzah!
+Not yet configured:
+* `http://localhost:8080/cgi/mb`
+* `http://localhost:8080/cgi/ls`
+* `http://localhost:8080/cgi/whoami`
+* `http://localhost:8080/cgi/ping`
+* etc
+
## How this works (for now)
-The `docker-commpose` provides a custom catalog configuration to the `nginx` service to
-proxy `babel` CGI requests to the `apache-cgi` service, and serve `common` requests from
-the local `common` checkout.
+* catalog runs nginx + php
+* babel cgi apps run under apache in a single container
+* imgsrv plack/psgi process runs in its own container
+
+## Staging an Item
-`apache-cgi` is there because `nginx` can only speak FastCGI/HTTP and running *all* the babel
-apps under FastCGI/HTTP is still aspirational.
+First, get a HathiTrust ZIP and METS. The easiest way to do this is probably by
+using the [Data API client](https://babel.hathitrust.org/cgi/htdc) to download
+a public domain item unencumbered by any contractual restrictions, for example
+`uc2.ark:/13960/t4mk66f1d`. Select "Download" and in turn select "Item METS
+file" and "entire item" and submit the form; this will download the METS and
+ZIP respectively.
+
+Running the stage item script requires a Ruby runtime. It will automate putting
+the item in the appropriate location under `imgsrv-sample-data`, fetch the
+bibliographic data, and extract and index the full text.
+
+First make sure all the dependencies are running:
+
+```bash
+docker-compose build
+docker-compose up
+```
+
+Then, install dependencies for the `stage-item` script and run it with the
+downloaded zip and METS:
+
+```bash
+cd stage-item
+bundle config set --local path 'vendor/bundle'
+bundle install
+bundle exec ruby stage_item.rb uc2.ark:/13960/t4mk66f1d ark+=13960=t4mk66f1d.zip ark+=13960=t4mk66f1d.mets.xml
+```
+
+Note that the zip and METS must be named as they are in the actual
+repository -- if you name them "foo.zip" or "foo.xml" they will not be renamed,
+and full-text indexing and PageTurner will not be able to find the item.
## TODO
-- [ ] merge the `imgsrv` DEV-231-grok branch and update the `Dockerfile`s to include `grok`
-- [ ] update `slip-lib/Searcher.pm` to set `wt=xml` because the new solr defaults return JSON
-- [ ] adding `pt` requires filling out more of the `ht_web` tables (namely `mb_*`)
+- [ ] add `mb` and `ls`
+- [ ] ensure database user can write to relevant tables
+- [ ] link to documentation for important tasks - e.g. running apps under debugging, updating css/js, etc
- [ ] easy mechanism to generate placeholder volumes in `imgsrv-sample-data` that correspond to the records in the catalog
-
+- [ ] make it easier to fetch real volumes
diff --git a/cache/.keep b/cache/.keep
new file mode 100644
index 000000000..e69de29bb
diff --git a/docker-compose.yml b/docker-compose.yml
index 82dc2e360..3ceeb1753 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,93 +8,144 @@ services:
- "8080:8080"
volumes:
- ./docker/nginx-default.conf:/etc/nginx/conf.d/default.conf
- - ../catalog:/app
- - ../common:/app/common-dev
+ - ./catalog:/app
+ - ./common:/app/common-dev
depends_on:
- vufind
- imgsrv
+ - apache-cgi
- apache-cgi:
- build:
- context: ".."
- dockerfile: "./babel-local-dev/Dockerfile"
+ imgsrv:
+ build: ./docker/babel-base
+ user: ${CURRENT_USER}
+ image: hathitrust-babel-base
volumes:
- - ../catalog:/htapps/app
- - ../common:/htapps/babel/common
- - ../imgsrv:/htapps/babel/imgsrv
- - ../pt:/htapps/babel/pt
- - ../mdp-web:/htapps/babel/mdp-web
- - ../mdp-lib:/htapps/babel/mdp-lib
- - ../slip-lib:/htapps/babel/slip-lib
- - ../plack-lib:/htapps/babel/plack-lib
- - "../imgsrv-sample-data/sdr1:/sdr1"
- - "../imgsrv-sample-data/etc:/htapps/babel/etc"
- - "../imgsrv-sample-data/watermarks:/htapps/babel/watermarks"
- - "../imgsrv-sample-data:/tmp/imgsrv-sample-data"
+ - ./imgsrv:/htapps/babel/imgsrv
+ - ./logs:/htapps/babel/logs
+ - ./cache:/htapps/babel/cache
+ - "./sample-data/sdr1:/sdr1"
+ - "./sample-data/etc:/htapps/babel/etc"
+ - "./sample-data/watermarks:/htapps/babel/watermarks"
+ - "./sample-data:/tmp/sample-data"
environment:
- SDRROOT=/htapps/babel
- SDRDATAROOT=/sdr1
- - HT_DEV=
- - MARIADB_USER=ht_web
+ - HT_DEV=docker
- REMOTE_ADDR=127.0.0.1
- HTTP_HOST=127.0.0.1
+ - APACHE_LOG_DIR=/tmp
depends_on:
- mysql-sdr
- solr-sdr-catalog
ports:
- - "41028:41028"
+ - "31028:31028"
+ command: bash -c "/htapps/babel/imgsrv/bin/startup_imgsrv"
- # assumes imgsrv-sample-data has been checked out next to "imgsrv"
- imgsrv:
- build: ../imgsrv
+ apache-cgi:
+ build: ./docker/apache-cgi
+ user: ${CURRENT_USER}
volumes:
- - ../imgsrv:/htapps/babel/imgsrv
- - "../imgsrv-sample-data/sdr1:/sdr1"
- - "../imgsrv-sample-data/etc:/htapps/babel/etc"
- - "../imgsrv-sample-data/watermarks:/htapps/babel/watermarks"
- - "../imgsrv-sample-data:/tmp/imgsrv-sample-data"
+ - "./docker/apache-cgi/000-default.conf:/etc/apache2/sites-enabled/000-default.conf"
+ - ./imgsrv:/htapps/babel/imgsrv
+ - ./logs:/htapps/babel/logs
+ - ./cache:/htapps/babel/cache
+ - ./pt:/htapps/babel/pt
+ - ./ssd:/htapps/babel/ssd
+ - ./common:/htapps/babel/common
+ - "./sample-data/sdr1:/sdr1"
+ - "./sample-data/etc:/htapps/babel/etc"
+ - "./sample-data/watermarks:/htapps/babel/watermarks"
+ - "./sample-data:/tmp/sample-data"
environment:
- SDRROOT=/htapps/babel
- SDRDATAROOT=/sdr1
- - HT_DEV=
- - MARIADB_USER=ht_web
- - REMOTE_ADDR=127.0.0.1
- - HTTP_HOST=127.0.0.1
depends_on:
- mysql-sdr
- solr-sdr-catalog
ports:
- - "31028:31028"
- command: bash -c "/htapps/babel/imgsrv/bin/startup_imgsrv"
+ - "41028:41028"
+
+ vufind:
+ build: ./catalog
+ hostname: 'catalog-dev'
+ volumes:
+ - ./catalog:/app
+ depends_on:
+ - mysql-sdr
+ - solr-sdr-catalog
+
+#### DATA STORES
solr-sdr-catalog:
image: ghcr.io/hathitrust/catalog-solr-sample
ports:
- "9033:9033"
+ solr-lss-dev:
+ image: solr:6
+ ports:
+ - "8983:8983"
+ user: ${CURRENT_USER}
+ volumes:
+ - ./lss_solr_configs/lss-dev/core-x:/opt/solr/server/solr/core-x
+ - ./lss_solr_configs/lss-dev/core-y:/opt/solr/server/solr/core-y
+ - ./lss_solr_configs:/opt/lss_solr_configs
+ - ./lss_solr_configs/lib:/opt/solr/server/solr/lib
+ - ./logs/solr:/opt/solr/server/logs
+
mysql-sdr:
- image: mariadb
+ image: ghcr.io/hathitrust/db-image:latest
volumes:
- - ../catalog/docker/vufind.sql:/docker-entrypoint-initdb.d/vufind.sql
- - ../imgsrv/vendor/common-lib/lib/sql/000_ht_schema.sql:/docker-entrypoint-initdb.d/0000_ht_schema.sql
- - ../imgsrv/vendor/common-lib/lib/sql/001_ht_ht_namespaces.sql:/docker-entrypoint-initdb.d/001_ht_ht_namespaces.sql
- - ../imgsrv/vendor/common-lib/lib/sql/002_ht_rights_current.sql:/docker-entrypoint-initdb.d/002_ht_rights_current.sql
- - ../imgsrv/sql/100_ht_web_schema.sql:/docker-entrypoint-initdb.d/100_ht_web_schema.sql
- - ../imgsrv/sql/200_users.sql:/docker-entrypoint-initdb.d/200_users.sql
+ - ./slip/etc/sql/100_slip.sql:/docker-entrypoint-initdb.d/100_slip.sql
+ - ./catalog/docker/vufind.sql:/docker-entrypoint-initdb.d/101_vufind.sql
+ ports:
+ - "3307:3306"
environment:
- # - MARIADB_RANDOM_ROOT_PASSWORD=1
+ # - MARIADB_RANDOM_ROOT_PASSWORD=1
MYSQL_ROOT_PASSWORD: TIMTOWTDIBSCINABTE
- vufind:
- build: ../catalog
- hostname: 'catalog-dev'
+#### INDEXING
+
+ # We add the 'indexing' profile to keep these from starting automatically
+ # when we do 'docker-compose up'
+
+ slip:
+ build: ./slip
+ image: hathitrust-slip
+ user: ${CURRENT_USER}
volumes:
- - ../catalog:/app
+ - ./slip:/htapps/babel/slip
+ - ./logs:/htapps/babel/logs
+ # this is where docs-j saves output
+ - ./slip/sample:/htapps/babel/logs/tmp
+ - "./sample-data/sdr1:/sdr1"
+ - "./sample-data/etc:/htapps/babel/etc"
+ - "./sample-data:/tmp/sample-data"
+ environment:
+ - SDRROOT=/htapps/babel
+ - SDRDATAROOT=/sdr1
+ - HT_DEV=docker
depends_on:
- mysql-sdr
- solr-sdr-catalog
+ command: bash
+ profiles:
+ - indexing
+
+ traject:
+ build: ./hathitrust_catalog_indexer
+ user: ${CURRENT_USER}
+ environment:
+ - SOLR_URL=http://solr-sdr-catalog:9033/solr/catalog
+ - redirect_file=/dev/null
+ - NO_DB=1
+ - DDIR=/app/metadata
+ depends_on:
+ - solr-sdr-catalog
+ volumes:
+ - "./sample-data/metadata:/app/metadata"
+ profiles:
+ - indexing
-volumes:
- # sdr1:
- data_db:
+ # todo: ingest, bound to sample-data
diff --git a/docker/000-default.conf b/docker/000-default.conf
deleted file mode 100644
index a23ff5ec2..000000000
--- a/docker/000-default.conf
+++ /dev/null
@@ -1,75 +0,0 @@
-Listen 41028
-
- # The ServerName directive sets the request scheme, hostname and port that
- # the server uses to identify itself. This is used when creating
- # redirection URLs. In the context of virtual hosts, the ServerName
- # specifies what hostname must appear in the request's Host: header to
- # match this virtual host. For the default virtual host (this file) this
- # value is not decisive as it is used as a last resort host regardless.
- # However, you must set it for any further virtual host explicitly.
- #ServerName www.example.com
-
- ServerAdmin webmaster@localhost
- DocumentRoot /htapps/babel
-
- # Available loglevels: trace8, ..., trace1, debug, info, notice, warn,
- # error, crit, alert, emerg.
- # It is also possible to configure the loglevel for particular
- # modules, e.g.
- #LogLevel info ssl:warn
-
- # ErrorLog ${APACHE_LOG_DIR}/error.log
- # CustomLog ${APACHE_LOG_DIR}/access.log combined
-
- LogLevel trace8
- ErrorLog /dev/stderr
- CustomLog /dev/stdout combined
-
- # For most configuration files from conf-available/, which are
- # enabled or disabled at a global level, it is possible to
- # include a line for only one particular virtual host. For example the
- # following line enables the CGI configuration for this host only
- # after it has been globally disabled with "a2disconf".
-
- Include conf-available/serve-cgi-bin.conf
- RewriteEngine On
-
- ## SetEnv/SetEnvIf for environment variables
- SetEnv SDRROOT /htapps/babel
- SetEnv SDRDATAROOT /sdr1
- SetEnv ASSERTION_EMAIL hathitrust-system@umich.edu
- SetEnv HT_DEV www-data
- # SetEnv HT_IGNORE_GEOIP true
-
- # SetEnv PTSEARCH_SOLR https://testing.ptsearch.kubernetes.hathitrust.org:8443/solr/ptsearch
- # SetEnv PTSEARCH_SOLR_BASIC_AUTH c29scjpwY1hoMVQxTVF4eExoRUNjSVZPME43MDc2Vk1WdzdUYms=
-
-
-
- Require not env badrobot
- Require not env loadbalancer
- Require all granted
-
-
-
-
- Options +ExecCGI
- SetHandler cgi-script
-
-
- RewriteCond %{DOCUMENT_ROOT}/babel/$1/web/$2 -f
- RewriteRule ^/([^/]+)/(.*) /$1/web/$2 [last]
-
- RewriteCond %{DOCUMENT_ROOT}/babel/$1/web/ -d
- RewriteRule ^/([^/]+)/?$ /$1/web/ [last]
-
- RewriteCond %{DOCUMENT_ROOT}/babel/$2/cgi/$3 -f
- RewriteRule ^/(shcgi|cgi)/([^/]+)/([^/]+)(.*)$ /$2/cgi/$3$4 [skip]
-
- RewriteCond %{DOCUMENT_ROOT}/babel/$2/cgi/$2 -f
- RewriteRule ^/(shcgi|cgi)/([^/]+)(.*)$ /$2/cgi/$2$3
-
- RewriteCond %{DOCUMENT_ROOT}/babel/$1/cgi/$3.choke -f
- RewriteRule ^/([^/]+)/(shcgi|cgi)/([^/]+)(.*)$ /$1/cgi/$3.choke$4 [last]
-
-
\ No newline at end of file
diff --git a/docker/apache-cgi/000-default.conf b/docker/apache-cgi/000-default.conf
new file mode 100644
index 000000000..318f32a94
--- /dev/null
+++ b/docker/apache-cgi/000-default.conf
@@ -0,0 +1,54 @@
+ServerName apache-cgi
+ErrorLog /dev/stdout
+CustomLog /dev/stdout combined
+Listen 41028
+
+
+ ServerAdmin hathitrust@localhost
+ DocumentRoot /htapps/babel
+
+ LogLevel debug
+ ErrorLog /dev/stdout
+ CustomLog /dev/stdout combined
+
+ RewriteEngine On
+
+ ## SetEnv/SetEnvIf for environment variables
+ SetEnv SDRROOT /htapps/babel
+ SetEnv SDRDATAROOT /sdr1
+ SetEnv ASSERTION_EMAIL hathitrust-system@umich.edu
+ SetEnv HT_DEV www-data
+ # SetEnv HT_IGNORE_GEOIP true
+
+ # SetEnv PTSEARCH_SOLR https://testing.ptsearch.kubernetes.hathitrust.org:8443/solr/ptsearch
+ # SetEnv PTSEARCH_SOLR_BASIC_AUTH c29scjpwY1hoMVQxTVF4eExoRUNjSVZPME43MDc2Vk1WdzdUYms=
+
+
+ Options Indexes FollowSymLinks
+ AllowOverride All
+ Require all granted
+
+
+
+ Options +ExecCGI
+ SetHandler cgi-script
+
+
+ RewriteCond %{DOCUMENT_ROOT}/$1/web/$2 -f
+ RewriteRule ^/([^/]+)/(.*) /$1/web/$2 [last]
+
+ RewriteCond %{DOCUMENT_ROOT}/$1/web/ -d
+ RewriteRule ^/([^/]+)/?$ /$1/web/ [last]
+
+ RewriteCond %{DOCUMENT_ROOT}/$2/cgi/$3 -f
+ RewriteRule ^/(cgi)/([^/]+)/([^/]+)(.*)$ /$2/cgi/$3$4 [skip]
+
+ RewriteCond %{DOCUMENT_ROOT}/$2/cgi/$2 -f
+ RewriteRule ^/(cgi)/([^/]+)(.*)$ /$2/cgi/$2$3
+
+ RewriteCond %{DOCUMENT_ROOT}/$1/cgi/$3.choke -f
+ RewriteRule ^/([^/]+)/(cgi)/([^/]+)(.*)$ /$1/cgi/$3.choke$4 [last]
+
+ # TODO include for auth env vars
+
+
diff --git a/docker/apache-cgi/Dockerfile b/docker/apache-cgi/Dockerfile
new file mode 100644
index 000000000..d9d5529c6
--- /dev/null
+++ b/docker/apache-cgi/Dockerfile
@@ -0,0 +1,21 @@
+# Apache front end for the babel CGI apps, layered on the shared babel base
+# image.
+FROM hathitrust-babel-base
+
+# Refresh the package index in the same layer as the install so the build does
+# not depend on whatever (possibly stale) apt lists the base image carries.
+RUN apt-get update && apt-get -y install apache2 libapache2-mod-fcgid
+
+# Start from a clean slate: disable every stock site, the extra vhost access
+# log and all MPMs, then enable only the modules listed below.
+RUN a2dissite '*'
+RUN a2disconf other-vhosts-access-log
+RUN a2dismod 'mpm_*'
+RUN a2enmod headers \
+    mpm_prefork \
+    rewrite \
+    proxy \
+    proxy_fcgi \
+    proxy_http \
+    cgi
+
+COPY 000-default.conf /etc/apache2/sites-enabled
+# Apache treats SIGWINCH as a graceful-stop request.
+STOPSIGNAL SIGWINCH
+
+# apache.sh exports the APACHE_* environment and execs apache2 in the
+# foreground.
+COPY apache.sh /
+RUN chmod +x /apache.sh
+ENTRYPOINT ["/apache.sh"]
diff --git a/docker/apache-cgi/apache.sh b/docker/apache-cgi/apache.sh
new file mode 100755
index 000000000..a42d1f1ba
--- /dev/null
+++ b/docker/apache-cgi/apache.sh
@@ -0,0 +1,21 @@
+#! /bin/bash
+
+# Entry point for the apache-cgi container: prepares writable runtime
+# directories under /tmp and then runs Apache in the foreground.
+
+# mkdir -p is a no-op for directories that already exist, so it is safe to run
+# unconditionally; this also recreates a missing run/lock/log subdirectory.
+mkdir -p /tmp/apache2/{run,lock,log}
+
+export APACHE_PID_FILE=/tmp/apache2/run/apache2.pid
+export APACHE_RUN_DIR=/tmp/apache2/run
+export APACHE_LOCK_DIR=/tmp/apache2/lock
+export APACHE_LOG_DIR=/tmp/apache2/log
+
+# Apache gets grumpy about PID files pre-existing, so remove the exact file
+# named by APACHE_PID_FILE before starting.
+rm -f "$APACHE_PID_FILE"
+
+# Won't be effective if we pass user from docker-compose; that's OK - hence
+# shenanigans above
+export APACHE_RUN_USER=www-data
+export APACHE_RUN_GROUP=www-data
+
+exec apache2 -DFOREGROUND
diff --git a/docker/babel-base/Dockerfile b/docker/babel-base/Dockerfile
new file mode 100644
index 000000000..b7866ebba
--- /dev/null
+++ b/docker/babel-base/Dockerfile
@@ -0,0 +1,82 @@
+# Shared base image for the babel applications. docker-compose.yml tags it as
+# hathitrust-babel-base; docker/apache-cgi/Dockerfile builds on top of it.
+FROM debian:bookworm
+
+# The following does not work on bookworm - evaluate if it's needed
+# RUN sed -i 's/main.*/main contrib non-free/' /etc/apt/sources.list
+
+# System packages: build tools, image utilities (grokj2k-tools, imagemagick,
+# netpbm) and the Perl libraries available from Debian.
+RUN apt-get update && apt-get install -y \
+    autoconf \
+    bison \
+    build-essential \
+    cpanminus \
+    curl \
+    file \
+    git \
+    grokj2k-tools \
+    imagemagick \
+    libapache-session-perl \
+    libconfig-tiny-perl \
+    libdate-calc-perl \
+    libdate-manip-perl \
+    libdbd-mysql-perl \
+    libdevel-cover-perl \
+    libfcgi-perl \
+    libfcgi-procmanager-perl \
+    libimage-exiftool-perl \
+    libimage-info-perl \
+    libimage-size-perl \
+    libio-string-perl \
+    libipc-run-perl \
+    libjson-xs-perl \
+    liblist-moreutils-perl \
+    libmailtools-perl \
+    libmime-types-perl \
+    libnet-dns-perl \
+    libplack-perl \
+    libtest-class-perl \
+    libtry-tiny-perl \
+    libxml-libxml-perl \
+    libxml-libxslt-perl \
+    netcat-traditional \
+    netpbm \
+    perl \
+    procps \
+    starman \
+    unzip \
+    uuid-dev \
+    zip \
+    zlib1g-dev
+
+# Perl modules installed from CPAN (tests skipped).
+RUN cpanm --notest \
+    File::Pairtree \
+    URI::Escape \
+    CGI::PSGI \
+    IP::Geolocation::MMDB \
+    UUID
+
+# MaxMind's published *test* country database stands in for a real GeoIP
+# database.
+WORKDIR /htapps/babel/geoip
+ADD --chmod=644 https://github.com/maxmind/MaxMind-DB/blob/main/test-data/GeoIP2-Country-Test.mmdb?raw=true GeoIP2-Country.mmdb
+
+RUN ln -s /tmp /ram
+
+# Expose the system tools under /l/local/bin as well.
+RUN mkdir -p /l/local/bin
+RUN ln -s /usr/bin/unzip /l/local/bin/unzip
+RUN ln -s /usr/bin/convert /l/local/bin/convert
+RUN ln -s /usr/bin/plackup /l/local/bin/plackup
+RUN /bin/bash -c 'for cmd in pamflip jpegtopnm tifftopnm bmptopnm pngtopam ppmmake pamcomp pnmscalefixed pamscale pnmrotate pnmpad pamtotiff pnmtotiff pnmtojpeg pamrgbatopng ppmtopgm pnmtopng; do ln -s /usr/bin/$cmd /l/local/bin; done'
+
+# Every relative path in the RUN instructions below is resolved against this
+# WORKDIR. A "RUN cd ..." would not change that: each RUN starts a new shell.
+WORKDIR /htapps/babel/imgsrv
+
+# NOTE(review): mode 4777 sets the setuid bit on top of rwx for everyone -
+# confirm this is intended rather than e.g. the sticky bit (1777).
+RUN mkdir /htapps/babel/cache
+RUN chmod 4777 /htapps/babel/cache
+
+RUN mkdir /htapps/babel/logs
+RUN chmod 4777 /htapps/babel/logs
+
+RUN ln -s /htapps/babel /htapps/test.babel
+
+COPY . /htapps/babel/imgsrv
+# Created as /htapps/babel/mdp-lib and /htapps/babel/mdp-web. The link targets
+# are relative to /htapps/babel, so they point into the imgsrv directory.
+RUN ln -s imgsrv/vendor/common-lib/lib ../mdp-lib
+RUN ln -s imgsrv/web/common-web ../mdp-web
+
+CMD ["/htapps/babel/imgsrv/bin/startup_imgsrv"]
diff --git a/docker/nginx-default.conf b/docker/nginx-default.conf
index 417a954fa..1b6b71aaa 100644
--- a/docker/nginx-default.conf
+++ b/docker/nginx-default.conf
@@ -76,9 +76,13 @@ server {
}
location ~ /cgi/imgsrv/(image|thumbnail|meta|html|ocr|cover)(.*) {
- rewrite /cgi/imgsrv/(.*)$ /$1 break;
- proxy_pass http://imgsrv:31028;
- # proxy_pass http://imgsrv:31028/$1$2;
+ fastcgi_split_path_info ^(/cgi/imgsrv)(/.+)$;
+ # try_files $fastcgi_script_name =404;
+ set $path_info $fastcgi_path_info;
+ fastcgi_param PATH_INFO $path_info;
+ fastcgi_pass imgsrv:31028;
+ fastcgi_param SCRIPT_FILENAME $fastcgi_script_name;
+ include fastcgi_params;
}
location /cgi/imgsrv {
@@ -96,6 +100,16 @@ server {
proxy_pass http://apache-cgi:41028/pt;
}
+ location /cgi/ssd {
+ # rewrite /cgi/pt/(.*)$ /cgi/pt/$1 break;
+ proxy_pass http://apache-cgi:41028/cgi/ssd;
+ }
+
+ location /ssd {
+ rewrite /ssd/(.*)$ /ssd/$1 break;
+ proxy_pass http://apache-cgi:41028/ssd;
+ }
+
location / {
try_files $uri @rewrite;
}
diff --git a/logs/.keep b/logs/.keep
new file mode 100644
index 000000000..e69de29bb
diff --git a/logs/solr/.keep b/logs/solr/.keep
new file mode 100644
index 000000000..e69de29bb
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 000000000..c398a5b03
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+cat <> .env
+echo "APACHE_RUN_USER=$(id -u)" >> .env
+echo "APACHE_RUN_GROUP=$(id -g)" >> .env
+
+# Do we need these separately?
+# git clone $GIT_BASE/mdp-lib.git
+# git clone $GIT_BASE/slip-lib.git
+# git clone $GIT_BASE/plack-lib.git
diff --git a/stage-item/.env b/stage-item/.env
new file mode 120000
index 000000000..4a82335f5
--- /dev/null
+++ b/stage-item/.env
@@ -0,0 +1 @@
+../.env
\ No newline at end of file
diff --git a/stage-item/Gemfile b/stage-item/Gemfile
new file mode 100644
index 000000000..04cc7d17a
--- /dev/null
+++ b/stage-item/Gemfile
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+source "https://rubygems.org"
+
+git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
+
+# gem "rails"
+
+gem "marc", "~> 1.2"
+gem "faraday", "~> 2.7"
+gem "faraday-follow_redirects"
+gem "ht-pairtree", github: "hathitrust/ht-pairtree", tag: 'v0.1.0'
+gem "mysql2"
+gem "sequel"
+
+group :development do
+ gem "pry"
+ gem "pry-byebug"
+end
diff --git a/stage-item/Gemfile.lock b/stage-item/Gemfile.lock
new file mode 100644
index 000000000..68f602c2b
--- /dev/null
+++ b/stage-item/Gemfile.lock
@@ -0,0 +1,55 @@
+GIT
+ remote: https://github.com/hathitrust/ht-pairtree
+ revision: a8620c464438c8de7ad9a492b1fd5bd7f0756079
+ tag: v0.1.0
+ specs:
+ ht-pairtree (0.1.0)
+ pairtree (~> 0.3)
+
+GEM
+ remote: https://rubygems.org/
+ specs:
+ byebug (11.1.3)
+ coderay (1.1.3)
+ faraday (2.7.4)
+ faraday-net_http (>= 2.0, < 3.1)
+ ruby2_keywords (>= 0.0.4)
+ faraday-follow_redirects (0.3.0)
+ faraday (>= 1, < 3)
+ faraday-net_http (3.0.2)
+ marc (1.2.0)
+ rexml
+ scrub_rb (>= 1.0.1, < 2)
+ unf
+ method_source (1.0.0)
+ mysql2 (0.5.5)
+ pairtree (0.3.0)
+ pry (0.14.2)
+ coderay (~> 1.1)
+ method_source (~> 1.0)
+ pry-byebug (3.10.1)
+ byebug (~> 11.0)
+ pry (>= 0.13, < 0.15)
+ rexml (3.2.5)
+ ruby2_keywords (0.0.5)
+ scrub_rb (1.0.1)
+ sequel (5.66.0)
+ unf (0.1.4)
+ unf_ext
+ unf_ext (0.0.8.2)
+
+PLATFORMS
+ x86_64-linux
+
+DEPENDENCIES
+ faraday (~> 2.7)
+ faraday-follow_redirects
+ ht-pairtree!
+ marc (~> 1.2)
+ mysql2
+ pry
+ pry-byebug
+ sequel
+
+BUNDLED WITH
+ 2.2.22
diff --git a/stage-item/stage_item.rb b/stage-item/stage_item.rb
new file mode 100644
index 000000000..b99cd3dc5
--- /dev/null
+++ b/stage-item/stage_item.rb
@@ -0,0 +1,160 @@
+#!ruby
+
+require "faraday"
+require "faraday/follow_redirects"
+require "fileutils"
+require "ht/pairtree"
+require "json"
+require "marc"
+require "sequel"
+require "tempfile"
+
+# The root of the babel-local-dev checkout (the parent of this script's
+# directory), under which the other HathiTrust repos are checked out
+HTDEV_ROOT = ENV["HTDEV_ROOT"] || File.realpath(File.join(__dir__,".."))
+
+METADATA_ROOT = ENV["METADATA_ROOT"] || File.join(HTDEV_ROOT,"sample-data","metadata")
+SDRDATAROOT = ENV["SDRDATAROOT"] || File.join(HTDEV_ROOT,"sample-data","sdr1")
+CATALOG_BASE = ENV["CATALOG_BASE"] || "https://catalog.hathitrust.org"
+MYSQL_URL = ENV["MYSQL_URL"] || "mysql2://mdp-admin:mdp-admin@127.0.0.1:3307/ht"
+CATALOG_SOLR = ENV["CATALOG_SOLR"] || "http://localhost:9033/solr/catalog"
+LSS_SOLR = ENV["LSS_SOLR"] || "http://localhost:8983/solr/core-x"
+
+# Stages one HathiTrust item into the local sample environment: copies its zip
+# and METS into the sample pairtree, fetches its catalog record, records its
+# rights in MySQL, and indexes both the metadata and the full text.
+class StageItem
+  attr_reader :htid, :namespace, :objid, :zip, :mets, :pt
+
+  # Command-line entry point. Prints usage and exits unless exactly three
+  # arguments (htid, zip, METS) were given.
+  def self.main
+    usage unless ARGV.length == 3
+
+    StageItem.new(*ARGV).run
+  end
+
+  # @param htid [String] item id of the form "namespace.objid"
+  # @param zip [String] path to the item zip; must exist and end in ".zip"
+  # @param mets [String] path to the item METS; must exist and end in ".xml"
+  #
+  # Prints usage and exits if any argument fails those checks.
+  def initialize(htid, zip, mets)
+    @htid = htid
+    # Split on the first "." only: the objid part may itself contain dots.
+    @namespace, @objid = htid.split(".", 2)
+    @zip = zip
+    @mets = mets
+
+    self.class.usage unless valid_arguments?
+
+    @pt = HathiTrust::Pairtree.new(root: File.join(SDRDATAROOT, "obj"))
+  end
+
+  # Runs every staging step in order: content, then metadata, then full text.
+  def run
+    stage_content
+    stage_metadata
+    index_full_text
+  end
+
+  # Fetches the catalog record, loads its rights into the database and hands
+  # the record to the metadata indexer. The record is written to a temporary
+  # file inside METADATA_ROOT that is removed when the block finishes.
+  def stage_metadata
+    Tempfile.create(["metadata", ".json"], File.realpath(METADATA_ROOT)) do |f|
+      metadata = fetch_metadata(f)
+      populate_database(metadata)
+      index_metadata(File.basename(f.path))
+    end
+  end
+
+  # Creates the item's pairtree directory (allowing a new namespace) and copies
+  # the zip and METS into it.
+  def stage_content
+    pt.create(htid, new_namespace_allowed: true)
+    repo_path = pt.path_for(htid)
+
+    puts("↪️ Copying zip and mets to repo #{repo_path}\n")
+    FileUtils.cp([zip, mets], repo_path)
+  end
+
+  # Downloads the item's catalog record as JSON, writes the raw JSON to the
+  # given file and returns the parsed record.
+  #
+  # @param tempfile [File] open, writable file that receives the raw JSON
+  # @return [MARC::Record]
+  def fetch_metadata(tempfile)
+    url = "/Record/HTID/#{htid}.json"
+    puts "📙 Getting metadata #{CATALOG_BASE}#{url} and saving to tempfile #{tempfile.path}\n"
+
+    conn = Faraday.new(CATALOG_BASE) do |f|
+      f.response :follow_redirects
+    end
+
+    json = conn.get(url).body
+
+    tempfile.write(json)
+    # Flush so the indexer, which reads the file by name, sees the content.
+    tempfile.flush
+
+    MARC::Record.new_from_hash(JSON.parse(json))
+  end
+
+  # Writes the item's rights into rights_current and mirrors that row into
+  # slip_rights, using the item's 974 field from the catalog record.
+  #
+  # @param record [MARC::Record] catalog record that contains the item
+  # @raise [RuntimeError] if the record has no 974 field for this htid
+  def populate_database(record)
+    catalog_id = record["001"].value
+
+    # each item has a 974 field; the HTID is in 974$u
+    item_data = record.fields("974").find { |f| f["u"] == htid }
+    raise "Can't find item data for #{htid} in record #{catalog_id}" unless item_data
+
+    rights_attr = item_data["r"]
+    rights_reason = item_data["q"]
+    rights_source = item_data["s"]
+    zephir_update_date = item_data["d"]
+
+    # Simplification for the purposes of testing data: for now, the access
+    # profile is 2 ('google') if the item was digitized by google and 1
+    # ('open') otherwise. We can add options later to override all the rights
+    # stuff for purposes of testing.
+    access_profile = (rights_source == "google") ? 2 : 1
+
+    rights_sql = <<~SQL
+      REPLACE INTO rights_current (namespace, id, attr, reason, source, access_profile, user, note) VALUES
+      (?, ?,
+      (SELECT id FROM attributes WHERE name = ?),
+      (SELECT id FROM reasons WHERE name = ?),
+      (SELECT id FROM sources WHERE name = ?),
+      ?,'stage-item','staged from catalog record by stage_item.rb')
+    SQL
+
+    slip_sql = <<~SQL
+      REPLACE INTO slip_rights (nid, attr, reason, source, user, time, sysid, update_time)
+      SELECT concat(namespace, '.', id), attr, reason, source, user, time, ?, ? FROM rights_current WHERE namespace = ? and id = ?
+    SQL
+
+    # Block form: Sequel disconnects when the block exits, even on error.
+    Sequel.connect(MYSQL_URL) do |dbh|
+      dbh[rights_sql, namespace, objid, rights_attr, rights_reason, rights_source, access_profile].insert
+      dbh[slip_sql, catalog_id, zephir_update_date, namespace, objid].insert
+    end
+  end
+
+  # Indexes the staged metadata file with the traject service, then commits
+  # using the helper functions from the catalog indexer's utils.sh.
+  #
+  # @param file [String] basename of the JSON file inside METADATA_ROOT
+  def index_metadata(file)
+    puts "📕 Indexing metadata..."
+
+    catalog_utils_sh = File.join(HTDEV_ROOT, "hathitrust_catalog_indexer", "bin", "utils.sh")
+    run_command("docker-compose", "run", "traject", "bin/index_file", "metadata/#{file}")
+    # The script path is passed as $1 so it needs no shell quoting.
+    run_command("bash", "-c", 'source "$1"; solr_url; commit', "bash", catalog_utils_sh)
+  end
+
+  # Generates the item's Solr documents with slip's docs-j and loads the
+  # resulting files from slip/sample into the full-text Solr.
+  def index_full_text
+    puts "📖 Indexing full text..."
+
+    run_command("docker-compose", "run", "slip", "index/docs-j", "-r11", "-I#{htid}")
+
+    slip_sample_dir = File.join(HTDEV_ROOT, "slip", "sample")
+    pt_objid = File.basename(mets, ".mets.xml")
+
+    # Match on a literal prefix instead of a shell glob so that characters such
+    # as "$" in the object id are not interpreted.
+    solr_docs = Dir.glob("*.solr.xml", base: slip_sample_dir)
+      .select { |name| name.start_with?(pt_objid) }
+      .sort
+      .map { |name| File.join(slip_sample_dir, name) }
+
+    if solr_docs.empty?
+      warn "⚠️  No #{pt_objid}*.solr.xml files found in #{slip_sample_dir}; nothing to load"
+      return
+    end
+
+    run_command("bash", File.join(slip_sample_dir, "load_into_solr.sh"), *solr_docs)
+  end
+
+  # Prints the usage message to standard error and exits with status 1.
+  def self.usage
+    STDERR.puts <<~EOT
+      Usage: #{$0} htid some_item.zip some_item.mets.xml
+
+      where htid is something like "namespace.objid".
+
+      Stages an item into the sample repository from a given zip and XML file. It:
+      * fetches metadata from the catalog
+      * indexes this into the sample catalog
+      * populates the rights_current and slip_rights table
+      * indexes the full text
+    EOT
+
+    exit 1
+  end
+
+  private
+
+  # True when the htid has non-empty namespace and objid parts, the file names
+  # carry the expected extensions, and both files exist.
+  def valid_arguments?
+    [namespace, objid].none? { |part| part.nil? || part.empty? } &&
+      zip.end_with?(".zip") && mets.end_with?(".xml") &&
+      [zip, mets].all? { |f| File.exist?(f) }
+  end
+
+  # Runs an external command without a shell, so arguments (for example htids
+  # containing "$") reach the program verbatim. Warns instead of failing
+  # silently when the command cannot be run or exits non-zero.
+  #
+  # @param argv [Array<String>] program name followed by its arguments
+  # @return [Boolean] whether the command succeeded
+  def run_command(*argv)
+    return true if system(*argv)
+
+    warn "⚠️  Command failed: #{argv.join(" ")}"
+    false
+  end
+end
+
+# Run the command-line entry point only when this file is executed directly,
+# not when it is loaded from another script.
+StageItem.main if $PROGRAM_NAME == __FILE__