KissPeter committed Jul 12, 2023
1 parent 63d70d3 commit 38f9108
Showing 18 changed files with 389 additions and 0 deletions.
38 changes: 38 additions & 0 deletions .github/.github/workflows/build_dockers.yml
@@ -0,0 +1,38 @@
# This workflow builds the Docker images with docker-compose and saves them as build artifacts
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application

on:
  push:
    branches:
      - '**'
  pull_request:
    branches: [ main ]

jobs:
  docker:
    runs-on: ubuntu-latest
    timeout-minutes: 300
    steps:
      - name: 'Checkout Repository'
        uses: actions/checkout@v3
      - uses: jpribyl/[email protected]
        continue-on-error: true
      - name: 'Docker Compose'
        uses: isbang/[email protected]
        with:
          compose-file: "./docker-compose.yml"
          up-flags: "--build --force-recreate"
      - name: Save text_summary
        if: success()
        uses: ishworkh/docker-image-artifact-upload@v1
        with:
          image: "text_summary:latest"
          retention_days: "30"
      - name: Save sentiment score
        if: success()
        uses: ishworkh/docker-image-artifact-upload@v1
        with:
          image: "sentiment_score:latest"
          retention_days: "30"
4 changes: 4 additions & 0 deletions .idea/.gitignore


11 changes: 11 additions & 0 deletions Dockerfile_download_model_data
@@ -0,0 +1,11 @@
FROM python:3.11.3-slim-buster
ENV PIP_NO_CACHE_DIR=yes
RUN pip install --upgrade pip
WORKDIR /src/
COPY download_models.py .
COPY requirements.txt .
COPY torch_requirements.txt .
RUN pip3 install -r requirements.txt
RUN pip3 install -r torch_requirements.txt
RUN find .
CMD python3 download_models.py
17 changes: 17 additions & 0 deletions Dockerfile_sentiment
@@ -0,0 +1,17 @@
FROM python:3.11.3-slim-buster
ENV PIP_NO_CACHE_DIR=yes
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends nginx-light && rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip
WORKDIR /src/
COPY sentiment_scoring .
COPY start_services.sh .
RUN rm -rf /etc/nginx/nginx.conf
COPY nginx.conf /etc/nginx/nginx.conf
COPY gconf.py .
COPY requirements.txt .
COPY torch_requirements.txt .
RUN pip3 install -r requirements.txt
RUN pip3 install -r torch_requirements.txt
ENV GUNICORN_CMD_ARGS="-c gconf.py --reuse-port"
EXPOSE 80
ENTRYPOINT /src/start_services.sh
17 changes: 17 additions & 0 deletions Dockerfile_summary
@@ -0,0 +1,17 @@
FROM python:3.11.3-slim-buster
ENV PIP_NO_CACHE_DIR=yes
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends nginx-light && rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip
WORKDIR /src/
COPY text_summary .
COPY start_services.sh .
RUN rm -rf /etc/nginx/nginx.conf
COPY nginx.conf /etc/nginx/nginx.conf
COPY gconf.py .
COPY requirements.txt .
COPY torch_requirements.txt .
RUN pip3 install -r requirements.txt
RUN pip3 install -r torch_requirements.txt
ENV GUNICORN_CMD_ARGS="-c gconf.py --reuse-port"
EXPOSE 80
ENTRYPOINT /src/start_services.sh
29 changes: 29 additions & 0 deletions README.md
@@ -1,2 +1,31 @@
# ML
ML models - sample code

# Links
* https://www.width.ai/post/bert-for-extractive-text-summarization-on-lectures
* https://www.width.ai/post/spacy-text-classification
* https://spacy.io/universe/project/coreferee
* https://github.com/richardpaulhudson/coreferee#getting-started
* https://www.width.ai/post/4-long-text-summarization-methods
* https://www.width.ai/post/gpt3-summarizer
* https://openai.com/research/summarizing-books
* https://github.com/amoramine/Pegasus_with_Longformer_summarization
* https://pypi.org/project/bert-extractive-summarizer/
* https://github.com/dmmiller612/bert-extractive-summarizer
* https://github.com/huggingface/neuralcoref
* https://hub.docker.com/r/openkbs/text-summary-docker
* https://medium.com/saturdays-ai/building-a-text-summarizer-in-python-using-nltk-and-scikit-learn-class-tfidfvectorizer-2207c4235548
* https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/pipelines#transformers.SummarizationPipeline
* https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct
* https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/pipelines#transformers.Text2TextGenerationPipeline



# How to process long text?
* https://smrzr.io/

Based on these projects and the attached sample code, build our own solution:
https://github.com/dmmiller612/bert-extractive-summarizer/network/dependents

# Two pillars of a good summarizer
1. Coreference resolution: identify and substitute the subject of each sentence accurately
2. Extractive summarization: the ability to cope with long token sequences
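A minimal sketch of the second pillar: splitting long input into overlapping word-window chunks that each fit a model's input limit before summarization. The `max_words` and `overlap` values here are illustrative assumptions, not tuned defaults, and a real pipeline would count model tokens rather than words:

```python
def chunk_text(text: str, max_words: int = 200, overlap: int = 20) -> list[str]:
    """Split text into overlapping word windows so each chunk fits a model limit."""
    words = text.split()
    chunks = []
    step = max_words - overlap
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        if start + max_words >= len(words):
            # the current window already covers the tail of the text
            break
    return chunks


if __name__ == "__main__":
    sample = " ".join(f"w{i}" for i in range(500))
    parts = chunk_text(sample)
    print(len(parts))  # 3 overlapping windows of up to 200 words
```

Each chunk would then be summarized independently and the partial summaries concatenated (or summarized again), which is the general shape of the long-text methods linked above.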
65 changes: 65 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,65 @@
version: '3.1'

services:

  download_model_data:
    container_name: download_model_data
    build:
      context: .
      dockerfile: Dockerfile_download_model_data
    volumes:
      - "./sentiment_scoring/model:/src/sentiment_scoring"
      - "./text_summary/model:/src/summarization"

  text_summary:
    depends_on:
      - download_model_data
    container_name: text_summary
    build:
      context: .
      dockerfile: Dockerfile_summary
    image: text_summary:latest
    environment:
      WORKERS: 1
      THREADS: 1
    ports:
      - "8002:80"
    deploy:
      resources:
        limits:
          cpus: '2'
    restart: 'no'
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536

  sentiment_score:
    depends_on:
      - download_model_data
    container_name: sentiment_score
    build:
      context: .
      dockerfile: Dockerfile_sentiment
    image: sentiment_score:latest
    environment:
      WORKERS: 1
      THREADS: 1
    ports:
      - "8003:80"  # host port 8003 so it does not clash with text_summary on 8002
    deploy:
      resources:
        limits:
          cpus: '2'
    restart: 'no'
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536

9 changes: 9 additions & 0 deletions download_models.py
@@ -0,0 +1,9 @@

from transformers import pipeline

# Download the summarization model into the directory that docker-compose
# mounts as ./text_summary/model on the host
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summarizer.save_pretrained("./summarization")

# Download the sentiment model; save under ./sentiment_scoring to match the
# "./sentiment_scoring/model:/src/sentiment_scoring" volume in docker-compose.yml
sentiment = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
sentiment.save_pretrained("./sentiment_scoring")
19 changes: 19 additions & 0 deletions gconf.py
@@ -0,0 +1,19 @@
import os

bind = 'unix:/tmp/gunicorn.sock'

# environment variables arrive as strings; coerce them to ints explicitly
workers = int(os.getenv('WORKERS', 1))
threads = int(os.getenv('THREADS', 1))
# backlog - The number of pending connections.
backlog = 64
# Workers silent for more than this many seconds are killed and restarted.
timeout = 300
# Timeout for graceful workers restart.
graceful_timeout = 300
# The maximum number of requests a worker will process before restarting.
max_requests = 0
max_requests_jitter = 0
worker_class = 'uvicorn.workers.UvicornWorker'
worker_tmp_dir = '/dev/shm'
# The number of seconds to wait for requests on a Keep-Alive connection.
keepalive = 120
61 changes: 61 additions & 0 deletions nginx.conf
@@ -0,0 +1,61 @@
# based on https://www.uvicorn.org/deployment/#running-behind-nginx
user www-data;
worker_processes 1;

pid /run/nginx.pid;
include /etc/nginx/modules-enabled/*.conf;

events {
    worker_connections 1024;
}

http {
    sendfile on;
    tcp_nopush on;
    types_hash_max_size 2048;
    include /etc/nginx/mime.types;
    default_type application/octet-stream;
    access_log /dev/stdout;
    error_log stderr info;
    gzip off;

    upstream gunicorn {
        # fail_timeout=0 means we always retry an upstream even if it failed
        # to return a good HTTP response
        server unix:/tmp/gunicorn.sock fail_timeout=0;
        keepalive 8;
    }

    server {
        listen 80 default_server;
        server_name _;
        server_tokens off;
        client_max_body_size 20M;
        keepalive_requests 5000;
        keepalive_timeout 120;
        set_real_ip_from 10.0.0.0/8;
        set_real_ip_from 172.16.0.0/12;
        set_real_ip_from 192.168.0.0/16;
        real_ip_header X-Forwarded-For;
        real_ip_recursive on;
        proxy_headers_hash_max_size 1024;
        proxy_headers_hash_bucket_size 128;

        location / {
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_pass_header Server;
            proxy_ignore_client_abort on;
            proxy_connect_timeout 65s;
            proxy_read_timeout 65s;
            proxy_send_timeout 65s;
            proxy_redirect off;
            proxy_http_version 1.1;
            proxy_set_header Connection "";
            proxy_buffering off;
            proxy_pass http://gunicorn;
        }
    }
}
18 changes: 18 additions & 0 deletions notes.md
@@ -0,0 +1,18 @@
https://github.com/dmmiller612/bert-extractive-summarizer
https://github.com/huggingface/neuralcoref
https://hub.docker.com/r/openkbs/text-summary-docker
https://medium.com/saturdays-ai/building-a-text-summarizer-in-python-using-nltk-and-scikit-learn-class-tfidfvectorizer-2207c4235548
https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/pipelines#transformers.SummarizationPipeline
https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct
https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/pipelines#transformers.Text2TextGenerationPipeline



https://pytorch.org/get-started/locally/
docker image tag text_summary:latest cloud.canister.io:5000/kisspeter/public/text_summary:latest
docker push -a cloud.canister.io:5000/kisspeter/public/text_summary
7 changes: 7 additions & 0 deletions requirements.txt
@@ -0,0 +1,7 @@
starlette
gunicorn
fastapi
uvicorn
python-multipart
transformers
markupsafe==2.0.1
Empty file added sentiment_scoring/__init__.py
Empty file.
44 changes: 44 additions & 0 deletions sentiment_scoring/app.py
@@ -0,0 +1,44 @@
import time

from fastapi import FastAPI, Form
from starlette.datastructures import MutableHeaders
from starlette.types import ASGIApp, Message, Receive, Scope, Send
from transformers import pipeline

app = FastAPI(debug=True)
# Prepare the model in a local folder (matches download_models.py):
# sentiment = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")
# sentiment.save_pretrained("./model")

# The "model" directory is populated by the download_model_data container
model_path = "model"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)


@app.post("/v1/score/")
def create_item(text: str = Form()):
    return sentiment_task(text)


class STARLETTEProcessTimeMiddleware:
app: ASGIApp

def __init__(self, app: ASGIApp, ) -> None:
self.app = app

async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
if scope["type"] != "http":
await self.app(scope, receive, send)
return
start_time = time.time()

async def send_wrapper(message: Message) -> None:
if message["type"] == "http.response.start":
headers = MutableHeaders(scope=message)
headers.append("X-Process-Time", str(round(time.time() - start_time, 4)))
await send(message)

await self.app(scope, receive, send_wrapper)


app.add_middleware(STARLETTEProcessTimeMiddleware)
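The timing middleware above follows the generic ASGI wrapping pattern. A stdlib-only sketch of the same idea, with Starlette's `MutableHeaders` replaced by raw `(name, value)` byte pairs and all names illustrative, can be exercised without a running server:

```python
import asyncio
import time


class ProcessTimeMiddleware:
    """Append an X-Process-Time header to every HTTP response."""

    def __init__(self, app):
        self.app = app

    async def __call__(self, scope, receive, send):
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return
        start_time = time.time()

        async def send_wrapper(message):
            if message["type"] == "http.response.start":
                # raw ASGI headers are a list of (name, value) byte pairs
                message.setdefault("headers", []).append(
                    (b"x-process-time", str(round(time.time() - start_time, 4)).encode())
                )
            await send(message)

        await self.app(scope, receive, send_wrapper)


async def plain_app(scope, receive, send):
    """Trivial ASGI app that always answers 200 'ok'."""
    await send({"type": "http.response.start", "status": 200, "headers": []})
    await send({"type": "http.response.body", "body": b"ok"})


def run_demo():
    """Drive the wrapped app with hand-rolled receive/send callables."""
    sent = []

    async def send(message):
        sent.append(message)

    async def receive():
        return {"type": "http.request"}

    wrapped = ProcessTimeMiddleware(plain_app)
    asyncio.run(wrapped({"type": "http"}, receive, send))
    return sent


if __name__ == "__main__":
    messages = run_demo()
    print(messages[0]["headers"])
```

Because the middleware only touches the `http.response.start` message, the response body passes through untouched, which is why it composes cleanly with `app.add_middleware` in the FastAPI app above.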
2 changes: 2 additions & 0 deletions start_services.sh
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
nginx && gunicorn app:app
Empty file added text_summary/__init__.py
Empty file.