From 1aff234cf6053bf2e5318f1f10a1b593b88828be Mon Sep 17 00:00:00 2001 From: Heiko Joerg Schick Date: Wed, 30 Aug 2023 04:09:41 +0200 Subject: [PATCH 1/5] Added initial Docker files --- .dockerignore | 34 ++++++++++++++++++++++++++++++ Dockerfile | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ compose.yaml | 49 +++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 10 ++++----- 4 files changed, 142 insertions(+), 5 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 compose.yaml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..3edb0b5 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,34 @@ +# Include any files or directories that you don't want to be copied to your +# container here (e.g., local build artifacts, temporary files, etc.). +# +# For more help, visit the .dockerignore file reference guide at +# https://docs.docker.com/engine/reference/builder/#dockerignore-file + +**/.DS_Store +**/__pycache__ +**/.venv +**/.classpath +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/bin +**/charts +**/docker-compose* +**/compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ce1f3dd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,54 @@ +# syntax=docker/dockerfile:1 + +# Comments are provided throughout this file to help you get started. +# If you need more help, visit the Dockerfile reference guide at +# https://docs.docker.com/engine/reference/builder/ + +ARG PYTHON_VERSION=3.10.1 +FROM python:${PYTHON_VERSION}-slim as base + +# Prevents Python from writing pyc files. +ENV PYTHONDONTWRITEBYTECODE=1 + +# Keeps Python from buffering stdout and stderr to avoid situations where +# the application crashes without emitting any logs due to buffering. 
+ENV PYTHONUNBUFFERED=1 + +# Install additional binary packages. +RUN apt-get update && apt-get install -y \ + curl \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Create a non-privileged user that the app will run under. +# See https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#user +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +# Download dependencies as a separate step to take advantage of Docker's caching. +# Leverage a cache mount to /root/.cache/pip to speed up subsequent builds. +# Leverage a bind mount to requirements.txt to avoid having to copy them +# into this layer. +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=requirements.txt,target=requirements.txt \ + python -m pip install -r requirements.txt + +# Switch to the non-privileged user to run the application. +USER appuser + +# Copy the source code into the container. +COPY --chown=appuser . . + +# Expose the port that the application listens on. +EXPOSE 5000 + +# Run the application. +CMD export FLASK_APP=serve.py; flask run --host=0.0.0.0 diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..94e7d9c --- /dev/null +++ b/compose.yaml @@ -0,0 +1,49 @@ +# Comments are provided throughout this file to help you get started. +# If you need more help, visit the Docker compose reference guide at +# https://docs.docker.com/compose/compose-file/ + +# Here the instructions define your application as a service called "server". +# This service is built from the Dockerfile in the current directory. +# You can add other services your application may depend on here, such as a +# database or a cache. For examples, see the Awesome Compose repository: +# https://github.com/docker/awesome-compose +services: + server: + build: + context: . 
+ ports: + - 5000:5000 + +# The commented out section below is an example of how to define a PostgreSQL +# database that your application can use. `depends_on` tells Docker Compose to +# start the database before your application. The `db-data` volume persists the +# database data between container restarts. The `db-password` secret is used +# to set the database password. You must create `db/password.txt` and add +# a password of your choosing to it before running `docker compose up`. +# depends_on: +# db: +# condition: service_healthy +# db: +# image: postgres +# restart: always +# user: postgres +# secrets: +# - db-password +# volumes: +# - db-data:/var/lib/postgresql/data +# environment: +# - POSTGRES_DB=example +# - POSTGRES_PASSWORD_FILE=/run/secrets/db-password +# expose: +# - 5432 +# healthcheck: +# test: [ "CMD", "pg_isready" ] +# interval: 10s +# timeout: 5s +# retries: 5 +# volumes: +# db-data: +# secrets: +# db-password: +# file: db/password.txt + diff --git a/requirements.txt b/requirements.txt index 304d2b4..a676515 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -feedparser==6.0.8 -Flask==2.0.2 -numpy==1.21.4 -scikit-learn==1.0.1 -sqlitedict==1.7.0 +feedparser +Flask +numpy +scikit-learn +sqlitedict From bee3df79f42fca370bc7641a9c508294c1fc6d23 Mon Sep 17 00:00:00 2001 From: Heiko Joerg Schick Date: Wed, 30 Aug 2023 14:15:14 +0200 Subject: [PATCH 2/5] Minor changes in Docker container --- cronjob | 0 entrypoint.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 cronjob create mode 100644 entrypoint.sh diff --git a/cronjob b/cronjob new file mode 100644 index 0000000..e69de29 diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000..e69de29 From 381b4ba7ffb31891288eaa7cb81c988541e97a82 Mon Sep 17 00:00:00 2001 From: Heiko Joerg Schick Date: Wed, 30 Aug 2023 14:16:25 +0200 Subject: [PATCH 3/5] Minor changes in Docker container --- Dockerfile | 40 +++++++++++++++++++++++++++++++--------- 
cron.sh | 10 ++++++++++ cronjob | 24 ++++++++++++++++++++++++ entrypoint.sh | 7 +++++++ requirements.txt | 3 ++- 5 files changed, 74 insertions(+), 10 deletions(-) create mode 100644 cron.sh diff --git a/Dockerfile b/Dockerfile index ce1f3dd..e20bc8c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,9 +14,20 @@ ENV PYTHONDONTWRITEBYTECODE=1 # the application crashes without emitting any logs due to buffering. ENV PYTHONUNBUFFERED=1 -# Install additional binary packages. +# Install required binary packages. +RUN apt-get update && apt-get install -y \ + imagemagick \ + && rm -rf /var/lib/apt/lists/* + +# Add PDF processing to the ImageMagick policy. +RUN sed -i 's/<policy domain="coder" rights="none" pattern="PDF" \/>/<policy domain="coder" rights="read|write" pattern="PDF" \/>/g' /etc/ImageMagick-6/policy.xml + +# DEBUG. Only for debug purposes. RUN apt-get update && apt-get install -y \ curl \ + procps \ + sudo \ + vim \ && rm -rf /var/lib/apt/lists/* WORKDIR /app @@ -24,14 +35,22 @@ WORKDIR /app # Create a non-privileged user that the app will run under. # See https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#user ARG UID=10001 -RUN adduser \ - --disabled-password \ - --gecos "" \ - --home "/nonexistent" \ - --shell "/sbin/nologin" \ - --no-create-home \ - --uid "${UID}" \ - appuser +#RUN adduser \ +# --disabled-password \ +# --gecos "" \ +# --home "/nonexistent" \ +# --shell "/sbin/nologin" \ +# --no-create-home \ +# --uid "${UID}" \ +# appuser + +# DEBUG. Only for debug purposes. +RUN useradd -r -u ${UID} -s /sbin/nologin -d /nonexistent appuser +RUN echo "appuser:12345678" | chpasswd +RUN echo 'appuser ALL=(ALL) NOPASSWD:ALL' | tee -a /etc/sudoers + +# Upgrade pip +RUN python -m pip install --upgrade pip # Download dependencies as a separate step to take advantage of Docker's caching. # Leverage a cache mount to /root/.cache/pip to speed up subsequent builds. @@ -47,6 +66,9 @@ USER appuser # Copy the source code into the container. COPY --chown=appuser . . +# Change file mode(s). 
+RUN chmod 0744 cron.sh + # Expose the port that the application listens on. EXPOSE 5000 diff --git a/cron.sh b/cron.sh new file mode 100644 index 0000000..8ba2dc1 --- /dev/null +++ b/cron.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +python3 /app/arxiv_daemon.py --num 100 + +if [ $? -eq 0 ]; then + echo "New papers detected! Running compute.py" + python3 /app/compute.py +else + echo "No new papers were added, skipping feature computation" +fi diff --git a/cronjob b/cronjob index e69de29..4223015 100644 --- a/cronjob +++ b/cronjob @@ -0,0 +1,24 @@ +# Edit this file to introduce tasks to be run by cron. +# +# Each task to run has to be defined through a single line +# indicating with different fields when the task will be run +# and what command to run for the task +# +# To define the time you can provide concrete values for +# minute (m), hour (h), day of month (dom), month (mon), +# and day of week (dow) or use '*' in these fields (for 'any').# +# Notice that tasks will be started based on the cron's system +# daemon's notion of time and timezones. +# +# Output of the crontab jobs (including errors) is sent through +# email to the user the crontab file belongs to (unless redirected). 
+# +# For example, you can run a backup of all your user accounts +# at 5 a.m every week with: +# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ +# +# For more information see the manual pages of crontab(5) and cron(8) +# +# m h dom mon dow command +*/1 * * * * /app/cron.sh +*/1 * * * * echo Hello diff --git a/entrypoint.sh b/entrypoint.sh index e69de29..0ad2ecf 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -0,0 +1,7 @@ +#!/bin/bash +echo "Docker container has been started" + +cron && tail -f /var/log/cron.log +export FLASK_APP=serve.py; flask run --host=0.0.0.0 +# sudo crontab cronjob +# sudo cron -f \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a676515..b2ef35a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ feedparser -Flask +flask numpy +requests scikit-learn sqlitedict From f181ae609ac3a3bc903f2ac6cc569668e6b6f54f Mon Sep 17 00:00:00 2001 From: Heiko Joerg Schick Date: Wed, 30 Aug 2023 14:17:11 +0200 Subject: [PATCH 4/5] Removed unused files --- cronjob | 24 ------------------------ entrypoint.sh | 7 ------- 2 files changed, 31 deletions(-) delete mode 100644 cronjob delete mode 100644 entrypoint.sh diff --git a/cronjob b/cronjob deleted file mode 100644 index 4223015..0000000 --- a/cronjob +++ /dev/null @@ -1,24 +0,0 @@ -# Edit this file to introduce tasks to be run by cron. -# -# Each task to run has to be defined through a single line -# indicating with different fields when the task will be run -# and what command to run for the task -# -# To define the time you can provide concrete values for -# minute (m), hour (h), day of month (dom), month (mon), -# and day of week (dow) or use '*' in these fields (for 'any').# -# Notice that tasks will be started based on the cron's system -# daemon's notion of time and timezones. -# -# Output of the crontab jobs (including errors) is sent through -# email to the user the crontab file belongs to (unless redirected). 
-# -# For example, you can run a backup of all your user accounts -# at 5 a.m every week with: -# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ -# -# For more information see the manual pages of crontab(5) and cron(8) -# -# m h dom mon dow command -*/1 * * * * /app/cron.sh -*/1 * * * * echo Hello diff --git a/entrypoint.sh b/entrypoint.sh deleted file mode 100644 index 0ad2ecf..0000000 --- a/entrypoint.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -echo "Docker container has been started" - -cron && tail -f /var/log/cron.log -export FLASK_APP=serve.py; flask run --host=0.0.0.0 -# sudo crontab cronjob -# sudo cron -f \ No newline at end of file From 6eb4cfc56b43ca84aebe77a9eebd8ed8e791b88a Mon Sep 17 00:00:00 2001 From: Heiko Joerg Schick Date: Wed, 30 Aug 2023 19:08:10 +0200 Subject: [PATCH 5/5] Enlarged query string for arxiv. --- arxiv_daemon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arxiv_daemon.py b/arxiv_daemon.py index 6ce68c7..4239e79 100644 --- a/arxiv_daemon.py +++ b/arxiv_daemon.py @@ -30,7 +30,7 @@ if __name__ == '__main__': """ # query string of papers to look for - q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO' + q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO+OR+cat:cs.AR+OR+cat:cs.DC' pdb = get_papers_db(flag='c') mdb = get_metas_db(flag='c')