diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7e1ce60 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.tex linguist-detectable=false +*.ipynb linguist-detectable=false \ No newline at end of file diff --git a/.gitignore b/.gitignore index d090f07..5434fa5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,10 +2,6 @@ **/.idea/** -# Project Proposal -!proposal/project-proposal.pdf - - ## Python **/__pycache__/** **/venv/** \ No newline at end of file diff --git a/proposal/.gitignore b/documentation/.gitignore similarity index 96% rename from proposal/.gitignore rename to documentation/.gitignore index 48bfdcd..2a9beee 100644 --- a/proposal/.gitignore +++ b/documentation/.gitignore @@ -299,4 +299,8 @@ TSWLatexianTemp* # glossaries *.glstex -# End of https://www.toptal.com/developers/gitignore/api/tex \ No newline at end of file +# End of https://www.toptal.com/developers/gitignore/api/tex + + +# Project Proposal +!proposal/project-proposal.pdf \ No newline at end of file diff --git a/documentation/PSPD/acronym.tex b/documentation/PSPD/acronym.tex new file mode 100644 index 0000000..7ce1c04 --- /dev/null +++ b/documentation/PSPD/acronym.tex @@ -0,0 +1,26 @@ +\chapter*{List of Acronyms} + +\begin{acronym} +% \acro{iaas}[IaaS]{Infrastructure as a Service} +\acro{sres}[SRE]{Site Reliability Engineer} +\acro{sli}[SLI]{Service Level Indicator} +\acro{sre}[SRE]{Site Reliability Engineering} +\acro{apm}[APM]{Application Performance Monitoring} +\acro{mttr}[MTTR]{Mean Time To Recovery} +% \acro{gan}[GAN]{Generative adversarial networks} +% \acro{hhmm}[HHMM]{Hierarchical hidden Markov model} +% \acro{fsl}[FSL]{Few-shot Learning} +% \acro{sdlc}[SDLC]{Software Development Life Cycle} +% \acro{ooad}[OOAD]{Object-oriented analysis and design} +\acro{ebpf}[eBPF]{Extended Berkeley Packet Filter} +% \acro{sla}[SLA]{Service-Level Agreement} +% \acro{saas}[SaaS]{Software as a service} +% \acro{vm}[VM]{Virtual Machine} +% \acro{cncf}[CNCF]{Cloud Native Computing Foundation} + +\acro{aiops}[AIOps]{Artificial Intelligence for IT operations} +% \acro{sre}[SRE]{Site Reliability Engineering} +\acro{gazer}[Gazer]{Telemetry extraction agent} +\acro{sherlock}[Sherlock]{AI-engine} +\acro{lazy-koala-operator}[Operator]{Lazy Koala Resource Manager} +\end{acronym} \ No newline at end of file diff --git a/proposal/assets/IIT-Logo.png b/documentation/PSPD/assets/IIT-Logo.png similarity index 100% rename from proposal/assets/IIT-Logo.png rename to documentation/PSPD/assets/IIT-Logo.png diff --git a/documentation/PSPD/assets/appendix/poc-results.png b/documentation/PSPD/assets/appendix/poc-results.png new file mode 100644 index 0000000..ed91087 Binary files /dev/null and b/documentation/PSPD/assets/appendix/poc-results.png differ diff --git a/documentation/PSPD/assets/appendix/prometheus-dashboard.png b/documentation/PSPD/assets/appendix/prometheus-dashboard.png new file mode 100644 index 0000000..18209b5 Binary files /dev/null and b/documentation/PSPD/assets/appendix/prometheus-dashboard.png differ diff --git a/documentation/PSPD/assets/implementation/abnormal-state.png b/documentation/PSPD/assets/implementation/abnormal-state.png new file mode 100644 index 0000000..27da991 Binary files /dev/null and b/documentation/PSPD/assets/implementation/abnormal-state.png differ diff --git a/documentation/PSPD/assets/implementation/after-normalization.png b/documentation/PSPD/assets/implementation/after-normalization.png new file mode 100644 index 0000000..45ac797 Binary files /dev/null and 
b/documentation/PSPD/assets/implementation/after-normalization.png differ diff --git a/documentation/PSPD/assets/implementation/backlog-probe.png b/documentation/PSPD/assets/implementation/backlog-probe.png new file mode 100644 index 0000000..9f2b5b7 Binary files /dev/null and b/documentation/PSPD/assets/implementation/backlog-probe.png differ diff --git a/documentation/PSPD/assets/implementation/before-normalization.png b/documentation/PSPD/assets/implementation/before-normalization.png new file mode 100644 index 0000000..2791ae3 Binary files /dev/null and b/documentation/PSPD/assets/implementation/before-normalization.png differ diff --git a/documentation/PSPD/assets/implementation/gazer-enrich.png b/documentation/PSPD/assets/implementation/gazer-enrich.png new file mode 100644 index 0000000..215aa4a Binary files /dev/null and b/documentation/PSPD/assets/implementation/gazer-enrich.png differ diff --git a/documentation/PSPD/assets/implementation/kubernetes-control-loop.png b/documentation/PSPD/assets/implementation/kubernetes-control-loop.png new file mode 100644 index 0000000..5042e69 Binary files /dev/null and b/documentation/PSPD/assets/implementation/kubernetes-control-loop.png differ diff --git a/documentation/PSPD/assets/implementation/normal-state.png b/documentation/PSPD/assets/implementation/normal-state.png new file mode 100644 index 0000000..9c8bf82 Binary files /dev/null and b/documentation/PSPD/assets/implementation/normal-state.png differ diff --git a/documentation/PSPD/assets/implementation/normalize-data.png b/documentation/PSPD/assets/implementation/normalize-data.png new file mode 100644 index 0000000..dca44c1 Binary files /dev/null and b/documentation/PSPD/assets/implementation/normalize-data.png differ diff --git a/documentation/PSPD/assets/implementation/reconcile-loop.png b/documentation/PSPD/assets/implementation/reconcile-loop.png new file mode 100644 index 0000000..503468e Binary files /dev/null and b/documentation/PSPD/assets/implementation/reconcile-loop.png differ diff --git a/documentation/PSPD/assets/implementation/service-benchmark.png b/documentation/PSPD/assets/implementation/service-benchmark.png new file mode 100644 index 0000000..0b9500b Binary files /dev/null and b/documentation/PSPD/assets/implementation/service-benchmark.png differ diff --git a/documentation/PSPD/assets/implementation/technology-stack.png b/documentation/PSPD/assets/implementation/technology-stack.png new file mode 100644 index 0000000..f430e99 Binary files /dev/null and b/documentation/PSPD/assets/implementation/technology-stack.png differ diff --git a/documentation/PSPD/assets/implementation/visualize-representation.png b/documentation/PSPD/assets/implementation/visualize-representation.png new file mode 100644 index 0000000..e05467c Binary files /dev/null and b/documentation/PSPD/assets/implementation/visualize-representation.png differ diff --git a/proposal/assets/High-level-system-diagram.png b/documentation/PSPD/assets/introduction/High-level-system-diagram.png similarity index 100% rename from proposal/assets/High-level-system-diagram.png rename to documentation/PSPD/assets/introduction/High-level-system-diagram.png diff --git a/documentation/PSPD/assets/literature-review/Container-orchestration-engines.png b/documentation/PSPD/assets/literature-review/Container-orchestration-engines.png new file mode 100644 index 0000000..5a97c36 Binary files /dev/null and b/documentation/PSPD/assets/literature-review/Container-orchestration-engines.png differ diff --git 
a/documentation/PSPD/assets/literature-review/concept-map.png b/documentation/PSPD/assets/literature-review/concept-map.png new file mode 100644 index 0000000..21bd5e7 Binary files /dev/null and b/documentation/PSPD/assets/literature-review/concept-map.png differ diff --git a/documentation/PSPD/assets/literature-review/containers-vs-virtual-machines.jpg b/documentation/PSPD/assets/literature-review/containers-vs-virtual-machines.jpg new file mode 100644 index 0000000..d30c83a Binary files /dev/null and b/documentation/PSPD/assets/literature-review/containers-vs-virtual-machines.jpg differ diff --git a/documentation/PSPD/assets/literature-review/demo.png b/documentation/PSPD/assets/literature-review/demo.png new file mode 100644 index 0000000..ede76db Binary files /dev/null and b/documentation/PSPD/assets/literature-review/demo.png differ diff --git a/documentation/PSPD/assets/literature-review/ebpf-architecture.png b/documentation/PSPD/assets/literature-review/ebpf-architecture.png new file mode 100644 index 0000000..f972baa Binary files /dev/null and b/documentation/PSPD/assets/literature-review/ebpf-architecture.png differ diff --git a/documentation/PSPD/assets/literature-review/linkerd-benchmark.png b/documentation/PSPD/assets/literature-review/linkerd-benchmark.png new file mode 100644 index 0000000..862d9e7 Binary files /dev/null and b/documentation/PSPD/assets/literature-review/linkerd-benchmark.png differ diff --git a/documentation/PSPD/assets/literature-review/num-of-anomaly-detection-papers.jpg b/documentation/PSPD/assets/literature-review/num-of-anomaly-detection-papers.jpg new file mode 100644 index 0000000..793b441 Binary files /dev/null and b/documentation/PSPD/assets/literature-review/num-of-anomaly-detection-papers.jpg differ diff --git a/documentation/PSPD/assets/literature-review/sidecar-proxy.png b/documentation/PSPD/assets/literature-review/sidecar-proxy.png new file mode 100644 index 0000000..d284cc0 Binary files /dev/null and b/documentation/PSPD/assets/literature-review/sidecar-proxy.png differ diff --git a/proposal/assets/gantt-chart.jpg b/documentation/PSPD/assets/methodology/gantt-chart.jpg similarity index 100% rename from proposal/assets/gantt-chart.jpg rename to documentation/PSPD/assets/methodology/gantt-chart.jpg diff --git a/documentation/PSPD/assets/requirement-specification/contex-digram.png b/documentation/PSPD/assets/requirement-specification/contex-digram.png new file mode 100644 index 0000000..fb00bae Binary files /dev/null and b/documentation/PSPD/assets/requirement-specification/contex-digram.png differ diff --git a/documentation/PSPD/assets/requirement-specification/onion-model.png b/documentation/PSPD/assets/requirement-specification/onion-model.png new file mode 100644 index 0000000..96f8bd5 Binary files /dev/null and b/documentation/PSPD/assets/requirement-specification/onion-model.png differ diff --git a/documentation/PSPD/assets/requirement-specification/poc-autoencoder.png b/documentation/PSPD/assets/requirement-specification/poc-autoencoder.png new file mode 100644 index 0000000..d71f983 Binary files /dev/null and b/documentation/PSPD/assets/requirement-specification/poc-autoencoder.png differ diff --git a/documentation/PSPD/assets/requirement-specification/rich-picture.png b/documentation/PSPD/assets/requirement-specification/rich-picture.png new file mode 100644 index 0000000..d641728 Binary files /dev/null and b/documentation/PSPD/assets/requirement-specification/rich-picture.png differ diff --git 
a/documentation/PSPD/assets/requirement-specification/use-case.png b/documentation/PSPD/assets/requirement-specification/use-case.png new file mode 100644 index 0000000..a7512cd Binary files /dev/null and b/documentation/PSPD/assets/requirement-specification/use-case.png differ diff --git a/documentation/PSPD/assets/system-design/data-flow-level-1.png b/documentation/PSPD/assets/system-design/data-flow-level-1.png new file mode 100644 index 0000000..0ece42d Binary files /dev/null and b/documentation/PSPD/assets/system-design/data-flow-level-1.png differ diff --git a/documentation/PSPD/assets/system-design/data-flow-level-2.png b/documentation/PSPD/assets/system-design/data-flow-level-2.png new file mode 100644 index 0000000..f404ba4 Binary files /dev/null and b/documentation/PSPD/assets/system-design/data-flow-level-2.png differ diff --git a/documentation/PSPD/assets/system-design/sequence-diagram-1.png b/documentation/PSPD/assets/system-design/sequence-diagram-1.png new file mode 100644 index 0000000..b39f8ab Binary files /dev/null and b/documentation/PSPD/assets/system-design/sequence-diagram-1.png differ diff --git a/documentation/PSPD/assets/system-design/sequence-diagram-2.png b/documentation/PSPD/assets/system-design/sequence-diagram-2.png new file mode 100644 index 0000000..8fe761d Binary files /dev/null and b/documentation/PSPD/assets/system-design/sequence-diagram-2.png differ diff --git a/documentation/PSPD/assets/system-design/tier-architecture.png b/documentation/PSPD/assets/system-design/tier-architecture.png new file mode 100644 index 0000000..38b58ff Binary files /dev/null and b/documentation/PSPD/assets/system-design/tier-architecture.png differ diff --git a/documentation/PSPD/assets/system-design/ui-home.png b/documentation/PSPD/assets/system-design/ui-home.png new file mode 100644 index 0000000..0256b20 Binary files /dev/null and b/documentation/PSPD/assets/system-design/ui-home.png differ diff --git a/documentation/PSPD/assets/system-design/ui-settings.png b/documentation/PSPD/assets/system-design/ui-settings.png new file mode 100644 index 0000000..c70570f Binary files /dev/null and b/documentation/PSPD/assets/system-design/ui-settings.png differ diff --git a/documentation/PSPD/assets/uow-logo.png b/documentation/PSPD/assets/uow-logo.png new file mode 100644 index 0000000..5d66916 Binary files /dev/null and b/documentation/PSPD/assets/uow-logo.png differ diff --git a/documentation/PSPD/chapters/abstract.tex b/documentation/PSPD/chapters/abstract.tex new file mode 100644 index 0000000..ffd2648 --- /dev/null +++ b/documentation/PSPD/chapters/abstract.tex @@ -0,0 +1,18 @@ +\chapter*{Abstract} + +Cloud computing has shown considerable growth in the past few years, due to its scalability and ease of use. With this change, a new programming paradigm called cloud-native was born. Cloud-native applications are often developed as a set of stand-alone microservices yet could depend on each other to provide a unified experience. Even though microservices introduce a lot of benefits when it comes to flexibility and scalability it could be a nightmare to operate in production. Specifically, when operating a large system with hundreds of microservices talking to each other, the smallest problem could result in failures all around the system. + +% Cloud computing is a steady rise for the past few years due to its scalability and ease of use. With this change, a new programming paradigm called cloud-native was born. 
Cloud-native applications are often developed as a set of stand-alone microservices yet, it could depend on each other to provide a unified experience. + +% This helps different teams to work on different services which increases the development velocity. This works well for medium to large companies but over time this mesh of services could become very complicated to a point where it's very difficult for a single person to understand the entire system. When the system consists of thousands of individual services talking and depending on each other, the network layer of that system becomes chaotic. A failure in a single point could create a ripple effect across the entire system. When something like that happens it could take a considerable amount of time to zero in on the exact point of the failure. + +The focus of this project is two-fold. First, the authors introduce a robust Kubernetes-native toolkit that helps both researchers and developers collect and process service telemetry data with zero instrumentation. Second, the authors propose a novel way of detecting anomalies by encoding raw metric data into an image-like structure and using a convolutional autoencoder to learn the general data distribution for each service and detect outliers. Finally, a weighted graph is used along with the previously calculated anomaly scores to find possible root causes for any system anomaly. + +Initial test results show that the telemetry extraction components are both resilient and lightweight even under sustained load, while the anomaly prediction algorithm seems to converge on target learning goals. +\newline +\newline +\textbf{Keywords}: +AIOps, Monitoring, Disaster Recovery, eBPF, Kubernetes +\newline +\textbf{Subject Descriptors}: +• Computing methodologies $\rightarrow$ Machine learning $\rightarrow$ Learning paradigms $\rightarrow$ Unsupervised learning $\rightarrow$ Anomaly detection • Computer systems organization $\rightarrow$ Architectures $\rightarrow$ Distributed architectures $\rightarrow$ Cloud computing \ No newline at end of file diff --git a/documentation/PSPD/chapters/appendix/main.tex b/documentation/PSPD/chapters/appendix/main.tex new file mode 100644 index 0000000..57f8479 --- /dev/null +++ b/documentation/PSPD/chapters/appendix/main.tex @@ -0,0 +1,6 @@ +\chapter*{Appendix} +\begin{appendices} +\input{chapters/appendix/use-case-description} +\input{chapters/appendix/poc} +\input{chapters/appendix/prometheus-dashboard} +\end{appendices} \ No newline at end of file diff --git a/documentation/PSPD/chapters/appendix/poc.tex b/documentation/PSPD/chapters/appendix/poc.tex new file mode 100644 index 0000000..ddb1e10 --- /dev/null +++ b/documentation/PSPD/chapters/appendix/poc.tex @@ -0,0 +1,6 @@ +\chapter{Proof of Concept Results}\label{appendix:poc-results} + +\begin{figure}[H] + \includegraphics[width=14cm]{assets/appendix/poc-results.png} + \caption{Proof of concept results (self-composed)} +\end{figure} \ No newline at end of file diff --git a/documentation/PSPD/chapters/appendix/prometheus-dashboard.tex b/documentation/PSPD/chapters/appendix/prometheus-dashboard.tex new file mode 100644 index 0000000..3f629f3 --- /dev/null +++ b/documentation/PSPD/chapters/appendix/prometheus-dashboard.tex @@ -0,0 +1,6 @@ +\chapter{Prometheus Dashboard}\label{appendix:prometheus-dashboard} + +\begin{figure}[H] + \includegraphics[width=16.5cm]{assets/appendix/prometheus-dashboard.png} + \caption{Prometheus dashboard with collected data (self-composed)} +\end{figure} \ No newline at
end of file diff --git a/documentation/PSPD/chapters/appendix/use-case-description.tex b/documentation/PSPD/chapters/appendix/use-case-description.tex new file mode 100644 index 0000000..b888445 --- /dev/null +++ b/documentation/PSPD/chapters/appendix/use-case-description.tex @@ -0,0 +1,126 @@ +{\let\clearpage\relax\chapter{Use Case Descriptions}\label{appendix:use-case-description}} + +\UseCaseDescription +{UC-01} +{Deploy Lazy Koala} +{Install \ac{lazy-koala-operator} to a Kubernetes cluster} +{Reliability Engineer} +{\begin{CompactItemizes} + \item A Kubernetes cluster running. + \item kubectl installed and configured to talk to the cluster. + \item Helm CLI installed. +\end{CompactItemizes}} +{N/A} +{N/A} +{\begin{CompactEnumerate} + \item Add Helm remote. + \item Run helm install command. + \item Kube API acknowledges the changes. + \item Display content of Notes.txt +\end{CompactEnumerate}} +{{\begin{CompactEnumerate} + \item Apply Kubernetes Manifest found in the code repository. + \item Kube API acknowledges the changes. +\end{CompactEnumerate}} +{\textbf{E1}: A \ac{lazy-koala-operator} couldn’t achieve desired state +\vspace{-4mm}\begin{enumerate} + \item The \ac{lazy-koala-operator} retries to achieve the desired state with an exponential backoff +\vspace{-7mm}\end{enumerate}} +{\begin{CompactItemizes} + \item \ac{lazy-koala-operator} deployed on the cluster. + \item Instance of \ac{gazer} deployed on every node. + \item New permission rules are registered with Kube API. +\end{CompactItemizes}}} + +\vspace{-2em} +\UseCaseDescription +{UC-02} +{Update Configuration} +{Add or Remove a service from a monitored list.} +{Reliability Engineer} +{\begin{CompactItemizes} + \item kubectl installed and configured to talk to a Kubernetes cluster. + \item The Kubernetes cluster has a \ac{lazy-koala-operator} deployed. + \item Established port forwarding connection with \ac{lazy-koala-operator}. +\end{CompactItemizes}} +{N/A} +{N/A} +{\begin{CompactEnumerate} + \item Visit the forwarded port on the local machine. + \item Open the “Services” tab. + \item Click Attach Inspector. + \item Select the namespace and the service. + \item Click Attach. + \item Status update sent to kube API. +\end{CompactEnumerate}} +{{\begin{CompactEnumerate} + \item Visit the forwarded port on the local machine. + \item Open the “Services” tab. + \item Scroll to the relevant record. + \item Press the delete button next to the name. + \item Confirm the action. + \item Status update sent to kube API. +\end{CompactEnumerate}} +{\textbf{E1}: Kube API not available +\vspace{-4mm}\begin{enumerate} + \item Show an error to the user asking to retry in a bit. +\vspace{-7mm}\end{enumerate}} +{\begin{CompactItemizes} + \item A new Inspector resource is attached to the service. +\end{CompactItemizes}}} + +\vspace{-2em} +\UseCaseDescription +{UC-03} +{Purge Lazy Koala} +{Remove Lazy Koala from a Kubernetes cluster.} +{Reliability Engineer} +{\begin{CompactItemizes} + \item kubectl installed and configured to talk to a Kubernetes cluster. + \item The Kubernetes cluster has a \ac{lazy-koala-operator} deployed. +\end{CompactItemizes}} +{N/A} +{N/A} +{\begin{CompactEnumerate} + \item Find the helm release name. + \item Run helm uninstall . +\end{CompactEnumerate}} +{{\begin{CompactEnumerate} + \item Locate Kubernetes Manifest found in the code repository. 
+ \item Run kubectl delete -f +\end{CompactEnumerate}} +{N/A} +{\begin{CompactItemizes} + \item All the resources provisioned by Lazy Koala, including the \ac{lazy-koala-operator} itself, get removed from the cluster. +\end{CompactItemizes}}} + +\vspace{-2em} +\UseCaseDescription +{UC-11} +{Reconcile on modified resources} +{Whenever a resource owned by the \ac{lazy-koala-operator} gets modified, kubelet invokes the reconciliation loop on the \ac{lazy-koala-operator}.} +{Kubelet} +{\begin{CompactItemizes} + \item \ac{lazy-koala-operator} is deployed +\end{CompactItemizes}} +{Read the cluster state} +{N/A} +{\begin{CompactEnumerate} + \item Resources get modified. + \item Kubelet invokes a reconciliation loop on the \ac{lazy-koala-operator}. + \item Check if the change is interesting. + \item Update child resources accordingly. +\end{CompactEnumerate}} +{{\begin{CompactEnumerate} + \item Resources get modified. + \item Kubelet invokes a reconciliation loop on the \ac{lazy-koala-operator}. + \item Check if the change is interesting. + \item Stop execution. +\end{CompactEnumerate}} +{\textbf{E1}: Error while reconciling +\vspace{-4mm}\begin{enumerate} + \item Retry with exponential backoff. +\vspace{-7mm}\end{enumerate}} +{\begin{CompactItemizes} + \item Cluster in the new desired state. +\end{CompactItemizes}}} \ No newline at end of file diff --git a/documentation/PSPD/chapters/implementation/chapter-overview.tex b/documentation/PSPD/chapters/implementation/chapter-overview.tex new file mode 100644 index 0000000..1245244 --- /dev/null +++ b/documentation/PSPD/chapters/implementation/chapter-overview.tex @@ -0,0 +1,3 @@ +\section{Chapter Overview} + +This chapter focuses on making the proposed system a reality. During this chapter, the author will talk about the tools and technologies he relied on to complete the working prototype, along with the reasoning behind all those choices. Then the author will share their experience implementing the core functionality of the system in line with his design goals. Finally, the chapter will be concluded with a self-reflection on achievements. \ No newline at end of file diff --git a/documentation/PSPD/chapters/implementation/chapter-summary.tex b/documentation/PSPD/chapters/implementation/chapter-summary.tex new file mode 100644 index 0000000..1ebecc0 --- /dev/null +++ b/documentation/PSPD/chapters/implementation/chapter-summary.tex @@ -0,0 +1,3 @@ +\section{Chapter Summary} + +In this chapter, the author shared their experiences and findings while implementing the proposed system. At the start, the author broke down the entire tech stack and explained all the tools and technologies used to build this project. Then the inner workings of the three core components were explained. Finally, the chapter concluded with a self-reflection where the author talked about the current results of the project. \ No newline at end of file diff --git a/documentation/PSPD/chapters/implementation/core-functionalities.tex b/documentation/PSPD/chapters/implementation/core-functionalities.tex new file mode 100644 index 0000000..7d69cf5 --- /dev/null +++ b/documentation/PSPD/chapters/implementation/core-functionalities.tex @@ -0,0 +1,92 @@ +\section{Implementation of Core Functionalities} + +This project contains three components that work together in order to make up the entire system. In this section, the inner workings of each of those components will be explained.
+ +\subsection{Lazy Koala Resource Manager (Operator)} + +\ac{lazy-koala-operator} is the heart of the entire project. It is responsible for binding all other components together. In the context of Kubernetes, an operator is an agent running on the cluster that is responsible for keeping one or more dedicated resources in sync with the desired state. + +For example, Kubernetes has a built-in resource named "Pod" which is the smallest deployable object in Kubernetes. So when a system administrator asks the Kube-API to create a pod out of a certain Docker container, the Kube-API will create a resource object and attach it to the pod operator. Once that's done, the pod operator will parse the pod resource specification and create a pod out of it. If for some reason the pod crashes or the administrator changes the specification of the pod, the operator will be notified and it will re-run its reconciliation function to match the observed state with the desired state. + +\begin{figure}[H] + \includegraphics[width=11.5cm]{assets/implementation/kubernetes-control-loop.png} + \caption{Kubernetes control loop \citep{hausenblas2019programming}} + % \label{fig:reconcile-loop} +\end{figure} + + +Coming back to the \ac{lazy-koala-operator}, it has a Custom Resource Definition (CRD) called Inspector. In its specification, there are three required values: a Deployment reference, a DNS reference, and a URL to download the model that was fine-tuned for this specific deployment. Once such a resource is deployed, the \ac{lazy-koala-operator} will first get the pods related to the deployment and populate the "scrapePoints" data structure with the IP address of each pod. Then it finds the IP address mapped to the DNS reference and appends that to the "scrapePoints". The \ac{lazy-koala-operator} will then compare the new scrapePoints hashmap with the scrapePoints hashmap created in the previous iteration and identify which points need to be added to and which need to be removed from the "gazer-config". After that, the \ac{lazy-koala-operator} will pull down the "gazer-config" ConfigMap, run it through the calculated changelog, and send a patch request to the kube-api with the new state. Since this ConfigMap gets mounted to every \ac{gazer} instance via the Kubernetes volume system, the changes made here are instantly reflected across all of the \ac{gazer} instances. As a final step, an instance of \ac{sherlock} will be provisioned with the model given in the specification. + +Figure \ref{fig:reconcile-loop} shows a part of this reconciliation loop, which gets repeated every time there is a change to an existing resource or whenever the user creates a new variant of this resource. + +\begin{figure}[H] + \includegraphics[height=15cm]{assets/implementation/reconcile-loop.png} + \caption{\ac{lazy-koala-operator} reconciliation loop (self-composed)} + \label{fig:reconcile-loop} +\end{figure}
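To make the scrape-point bookkeeping concrete, the short sketch below illustrates how the changelog between two iterations could be derived. It is written in Python purely for brevity (the actual \ac{lazy-koala-operator} is implemented in Go), and the names and values shown are hypothetical rather than taken from the real implementation.

\begin{verbatim}
def diff_scrape_points(previous, current):
    """Return which scrape points must be appended to and removed from
    the gazer-config, given the previous and newly observed sets."""
    added = current - previous
    removed = previous - current
    return added, removed

# Example: one pod was rescheduled and came back with a new IP address.
previous = {"10.0.1.4", "10.0.0.12"}
current = {"10.0.1.9", "10.0.0.12"}
added, removed = diff_scrape_points(previous, current)
print(added)    # {'10.0.1.9'}  -> patch into gazer-config
print(removed)  # {'10.0.1.4'}  -> drop from gazer-config
\end{verbatim}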
+ + + +\subsection{Telemetry extraction agent (Gazer)} + +\ac{gazer} is the telemetry extraction agent that gets scheduled to run on every node of the cluster using a Kubernetes DaemonSet. \ac{gazer} is implemented in Python with the help of a library called BCC, which acts as a frontend for the \ac{ebpf} API. \ac{gazer} contains two kernel probes that get submitted to the kernel space at startup. + +The first probe is a TCP SYN backlog monitor that keeps track of the backlog size of the TCP SYN queue. Every TCP connection starts with a 3-way handshake, and the SYN packet is the first packet sent by the client in this sequence; the entire request is kept on hold until this packet is acknowledged by the system. Hence, an unusually high SYN backlog is a strong signal of something going wrong. Figure \ref{fig:backlog-probe} showcases the core part of this probe and how the backlog size is calculated. + +\begin{figure}[H] + \includegraphics[width=14cm]{assets/implementation/backlog-probe.png} + \caption{eBPF probe for collecting the TCP backlog (self-composed)} + \label{fig:backlog-probe} +\end{figure} + +Next is a tracepoint probe that gets invoked whenever the inet\_sock\_set\_state kernel function is called. This probe extracts five key data points from every TCP request: the transmitting and receiving IP addresses, the number of bytes sent and received, and finally the time taken to complete the entire request. All these data are shipped to userspace via a perf buffer. In userspace, these raw data get enriched with the data collected from the kube-api. + +As shown in figure \ref{fig:gazer-enrich}, since \ac{gazer} already has a list of IP addresses of interest given by the \ac{lazy-koala-operator}, it first checks whether the request was made from one of those IPs (only the transmitting IP is checked here, since every request gets a duplicate pair of entries, one for the request and one for the response, and all the other attributes are shared within that pair). If it's found, the parser tries to identify the receiving IP address too. If the receiving IP also has a match, the requests-received counter for that particular service is increased. Then the parser moves on to record the number of bytes sent and received and the time taken to complete the request under the identified service. Finally, these data points are exposed via an HTTP server so that the Prometheus scraper can read them and store them in the database to be consumed by \ac{sherlock}. + +\begin{figure}[H] + \includegraphics[width=14cm]{assets/implementation/gazer-enrich.png} + \caption{Code used to enrich TCP event data (self-composed)} + \label{fig:gazer-enrich} +\end{figure}
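The following minimal Python sketch shows the general shape of this enrichment and export step using the prometheus\_client library. The metric names, label layout and event fields are illustrative assumptions and do not mirror the actual \ac{gazer} source.

\begin{verbatim}
from prometheus_client import Counter, start_http_server

# Hypothetical metric names; the real Gazer exporter may differ.
requests_received = Counter("requests_received_total",
                            "TCP requests received per service", ["service"])
bytes_sent = Counter("bytes_sent_total",
                     "Bytes transmitted per service", ["service"])

# IP -> service mapping distributed by the operator via gazer-config.
watched_ips = {"10.0.1.9": "checkout", "10.0.0.12": "payments"}

def handle_event(event):
    """Called for every TCP event read from the perf buffer."""
    src = watched_ips.get(event["saddr"])
    dst = watched_ips.get(event["daddr"])
    if src is None or dst is None:
        return  # neither endpoint belongs to a monitored service
    requests_received.labels(service=dst).inc()
    bytes_sent.labels(service=src).inc(event["tx_bytes"])

start_http_server(8000)  # endpoint scraped by Prometheus
\end{verbatim}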
+ + +\subsection{AI-engine (Sherlock)} + +\ac{sherlock} is the AI engine that predicts anomaly scores for each service, which are in turn used by the \ac{lazy-koala-operator} to figure out possible root causes for a particular issue. From a high level, this works by polling service telemetry for a predetermined number of time steps and running it through a convolutional autoencoder which tries to reconstruct the input data sequence. The difference between the input sequence and the output sequence is called the reconstruction error, and this is used as the anomaly score for this specific service. Even though this process seems straightforward, a number of preprocessing steps have to be taken in order to make it easier for the model to converge on the learning goal. + +Since the collected metric data has different units, each feature of the dataset has a different range. This makes the training process very inefficient since the model has to learn the concept of scales and units first, and the backpropagation algorithm works best when the output values of the network are between 0 and 1 \citep{sola1997importance}. So to normalize this dataset, a slightly modified version of the min-max normalization equation was used. This was done because, under most typical conditions, metric values fluctuate within a fixed and limited range. If the min-max normalization function was applied as is, the model may become hypersensitive to the slightest fluctuation. Adding this padding on both the high and low ends acts as an attention mechanism that helps the model look for large variations rather than focusing on smaller ones. + +\begin{figure}[H] + \includegraphics[width=12cm]{assets/implementation/normalize-data.png} + \caption{Data normalization function (self-composed)} + \label{fig:normalize-data} +\end{figure} + + +\begin{figure}[H] + \centering + \begin{subfigure}[b]{0.48\textwidth} + \centering + \includegraphics[width=\textwidth]{assets/implementation/before-normalization.png} + \caption{Before Normalization} + \label{fig:before-normalization} + \end{subfigure} + \hfill + \begin{subfigure}[b]{0.49\textwidth} + \centering + \includegraphics[width=\textwidth]{assets/implementation/after-normalization.png} + \caption{After Normalization} + \label{fig:after-normalization} + \end{subfigure} + \hfill + \caption{Comparison of a data point before and after the data normalization (self-composed)} +\end{figure} + +% After the normalization dataset is formated to +During the requirement engineering process, it was found that even though RNNs tend to perform better with time-series data, convolutional autoencoders are very efficient at detecting anomalies in time-series data. So after the normalization step, the metric data is encoded into an image-like structure that can be fed into a convolutional autoencoder. + +\begin{figure}[H] + \includegraphics[height=7cm]{assets/implementation/visualize-representation.png} + \caption{Visualization of encoded time series (self-composed)} + \label{fig:visualize-representation} +\end{figure}
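As a rough illustration of this pipeline, the sketch below normalizes a metric window with padded min-max scaling, stacks it into an image-like array, and scores it by the reconstruction error of an autoencoder. The window size, feature count, padding factor and the identity stand-in for the trained model are assumptions for demonstration only; the real \ac{sherlock} preprocessing differs in its details.

\begin{verbatim}
import numpy as np

def normalize(window, low, high, padding=0.1):
    """Padded min-max normalization: the observed range is widened on
    both ends so small fluctuations map to small changes in [0, 1]."""
    span = high - low
    lo = low - padding * span
    hi = high + padding * span
    return np.clip((window - lo) / (hi - lo), 0.0, 1.0)

def encode(window):
    """Stack a (timesteps, features) window into an image-like
    (1, features, timesteps) array for a convolutional autoencoder."""
    return window.T[np.newaxis, :, :].astype(np.float32)

def anomaly_score(model, window):
    """Reconstruction error between the encoded input and the model output."""
    x = encode(window)
    reconstruction = model(x)
    return float(np.mean((x - reconstruction) ** 2))

# Example: 4 metrics observed over 60 time steps.
metrics = np.random.rand(60, 4)
normalized = normalize(metrics, metrics.min(axis=0), metrics.max(axis=0))
identity_model = lambda x: x  # stand-in for the trained autoencoder
print(anomaly_score(identity_model, normalized))  # ~0 for a perfect reconstruction
\end{verbatim}

With a trained autoencoder in place of the identity stand-in, windows drawn from the learned distribution yield low scores, while unseen metric patterns produce large reconstruction errors and therefore high anomaly scores.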
 \ No newline at end of file diff --git a/documentation/PSPD/chapters/implementation/main.tex b/documentation/PSPD/chapters/implementation/main.tex new file mode 100644 index 0000000..78a75d0 --- /dev/null +++ b/documentation/PSPD/chapters/implementation/main.tex @@ -0,0 +1,10 @@ +\chapter{Initial Implementation} + +\input{chapters/implementation/chapter-overview} +\input{chapters/implementation/technology-selection} +\input{chapters/implementation/core-functionalities} +\input{chapters/implementation/self-reflection} +\input{chapters/implementation/chapter-summary} + + + diff --git a/documentation/PSPD/chapters/implementation/self-reflection.tex b/documentation/PSPD/chapters/implementation/self-reflection.tex new file mode 100644 index 0000000..f647eae --- /dev/null +++ b/documentation/PSPD/chapters/implementation/self-reflection.tex @@ -0,0 +1,36 @@ +\section{Self-Reflection} + +Even though the start of the implementation phase was a bit rocky, with time the author developed a deep understanding of all the tools and technologies required to build this project and managed to implement all the core functionalities to a high standard. One of the main goals the author set for themselves was to make this entire system as efficient as possible. Figure \ref{fig:service-benchmark} shows the resource usage of four services that are provisioned as part of this system while running under heavy load. Even then, all four components combined use less than 200MB of memory and less than 0.1\% of the total CPU available. + +\begin{figure}[H] + \includegraphics[width=15cm]{assets/implementation/service-benchmark.png} + \caption{Resource usage of Lazy-Koala components (self-composed)} + \label{fig:service-benchmark} +\end{figure} + +At the current stage of the project, all the functionality of \ac{gazer} is implemented properly and telemetry scraping works as expected (refer to Appendix \ref{appendix:prometheus-dashboard} for the Prometheus dashboard). However, some of the functionality of the \ac{lazy-koala-operator} was left out due to the limited time; notably, the \ac{lazy-koala-operator} doesn't yet provision an instance of \ac{sherlock} because the model delivery process hasn't been finalized. + +Initial results of \ac{sherlock} show that the author's hypothesis about the machine learning model architecture is feasible. Figure \ref{fig:normal-state} shows the encoded metric state while the system was stable, and the reconstruction from \ac{sherlock} shows a similar output. This means the reconstruction error is low, hence the lower anomaly score. When it comes to figure \ref{fig:abnormal-state}, there is a large difference between the input and output data frames. This is because the model hasn't seen metric levels like these and tries to stick to what it has been trained on, hence the large reconstruction error. + +\begin{figure}[H] + \centering + \begin{subfigure}[b]{0.7\textwidth} + \centering + \includegraphics[width=\textwidth]{assets/implementation/normal-state.png} + \caption{Normal State} + \label{fig:normal-state} + \end{subfigure} + \hfill + \begin{subfigure}[b]{0.7\textwidth} + \centering + \includegraphics[width=\textwidth]{assets/implementation/abnormal-state.png} + \caption{Abnormal State} + \label{fig:abnormal-state} + \end{subfigure} + \hfill + \caption{Initial results of the \ac{sherlock} model (self-composed)} +\end{figure} + +\section{Video Demo} + +The explanation of the research and a video demonstration of the project can be found here: \href{https://www.youtube.com/watch?v=d5C2qLK1rG8}{https://www.youtube.com/watch?v=d5C2qLK1rG8} \ No newline at end of file diff --git a/documentation/PSPD/chapters/implementation/technology-selection.tex b/documentation/PSPD/chapters/implementation/technology-selection.tex new file mode 100644 index 0000000..96bddea --- /dev/null +++ b/documentation/PSPD/chapters/implementation/technology-selection.tex @@ -0,0 +1,98 @@ +\section{Technology Selection} +\subsection{Technology Stack} + +\begin{figure}[H] + \includegraphics[width=16cm]{assets/implementation/technology-stack.png} + \caption{Technology stack (self-composed)} + \label{fig:technology-stack} +\end{figure} + +At a glance, this seems like a lot of tooling for an undergraduate-level project, but every tool and technology listed here provides vital functionality to make the final prototype as efficient and reliable as possible in order to live up to the design goals listed above. + +\subsection{Programming Language} + +This project was built using five different programming languages, each chosen for the unique features required by different components of the system. + +\begin{itemize} + \item \textbf{GoLang} - Go is a programming language invented by Google that took a lot of inspiration from C but with modern features like memory safety, automatic garbage collection, and built-in concurrency. Due to this, Go was used to build Kubernetes itself, and almost all the tooling around Kubernetes relies on it as the language of choice.
Since this project tries to extend some functionality of Kubernetes, it was highly recommended to use Go to build the parts of the system that interface with Kubernetes tooling. + \item \textbf{Python} - Python is known as the go-to language for data science, but in this project, Python had another critical role. There is a library called BCC which streamlines the connection to the Linux \ac{ebpf} API. So both the data science and telemetry extraction components are built using Python. + \item \textbf{C} - The Linux \ac{ebpf} API allows userspace applications to submit sandboxed programs into kernel space at runtime, and the kernel is responsible for compiling and executing them alongside kernel calls. But since this is a Linux kernel feature, all the sandboxed programs need to be written in C so the kernel can understand them. + \item \textbf{TypeScript} - For frontend applications, programming languages are limited to Javascript and TypeScript. TypeScript was chosen for this project since it offers a lot of compile-time checks which prevent accidental bugs from sneaking into the production application. + \item \textbf{Rust} - Out of all the tools and technologies, this may be the only replaceable technology. Rust will be used in the \ac{sherlock} module to interface with the machine learning model. Even though Python or even Go could be viable alternatives, Rust has one of the lowest memory and CPU footprints among modern programming languages. Since one of the design goals of this project is to have the lowest overhead possible, the author settled on Rust for this task. +\end{itemize} + + + +\subsection{Libraries Utilized} +Software libraries prevent software engineers from reinventing the wheel every time they want to perform some common functionality. This is done by providing abstractions they can use to perform the task at hand so they can focus on the important parts. To build this project, a number of libraries were used in both the UI and backend components. + + +\subsubsection{Frontend} +\begin{itemize} + \item \textbf{ReactJS} - Since this is a hosted application, a web-based UI made more sense. There are a few common ways to build a web-based frontend: it is possible to use a vanilla HTML stack and build everything from scratch, but for this project the UIs need to be interactive, which left the author with three viable choices: ReactJS, VueJS, and Svelte. Since the author was more familiar with ReactJS, he chose to rely on it for the UI implementation. + \item \textbf{Mantine} - Mantine is a React component library which helps developers use prebuilt components like date pickers without having to code them from scratch. Viable alternatives are Bootstrap, MaterialUI, and Ant Design. The author settled on Mantine due to its user-friendliness and rich TypeScript support. + \item \textbf{D3.js} - D3.js is a UI library that helps to create interactive graphs. This library was used to create the service topology graph. There weren't any alternatives that allowed rendering directed graphs with interactivity. +\end{itemize} + + +\subsubsection{Backend} +\begin{itemize} + \item \textbf{Kubebuilder} - Kubebuilder is an SDK used to build custom Kubernetes operators. It is created and maintained by a Kubernetes special interest group along with community support. The only viable alternative is the Operator SDK, which is developed by the community with the support of RedHat.
Since the author had prior experience with Kubebuilder and its inner workings, it was decided to rely upon it. + \item \textbf{BCC} - BCC is a frontend to the Linux \ac{ebpf} API which makes deploying kernel probes and retrieving data from them very easy. Other than relying on raw C interfaces, this was the only viable solution. + \item \textbf{Pandas} - Pandas is a data manipulation and analysis library that was used in both the \ac{gazer} and \ac{sherlock} modules. + \item \textbf{Numpy} - NumPy is considered a holy grail when it comes to data science work in Python. Even libraries like Tensorflow and PyTorch rely on it for the manipulation of multi-dimensional arrays and matrices. In the training phase of \ac{sherlock}, NumPy was heavily used to preprocess the datasets. + \item \textbf{PyTorch Lightning} - PyTorch Lightning is a layer of abstraction on top of the PyTorch library. The author chose to opt into the PyTorch ecosystem rather than the Tensorflow ecosystem because, during the last 1-2 years, Tensorflow got a lot of criticism for being unoptimized for modern data science. The author also wanted some hands-on experience working with PyTorch since the industry seems to be moving towards a PyTorch-dominated era. +\end{itemize} + +\subsection{Persistent Storage} + +\begin{itemize} + \item \textbf{Prometheus} - Prometheus is a time series database that doubles as a data scraping agent. Prometheus uses the pull method, in contrast to the usual push methods, to update the database. During the requirement engineering phase, it was discovered that the vast majority of companies rely on Prometheus. So for this project, the author decided to rely on it as the primary database for storing metric data due to its popularity and tight integration with Kubernetes. + \item \textbf{etcd} - etcd is a key-value data store which is built into the core of Kubernetes. The \ac{lazy-koala-operator} relies on it both to sync the monitoring config with \ac{gazer} instances and to keep track of all the monitored services. + \item \textbf{GCP Artifact Registry} - Artifact Registry stores the trained models and Docker containers. Viable alternatives are Github Container Registry, DockerHub, and Azure Container Registry. GCP Artifact Registry was chosen due to its competitive pricing and cutting-edge features like AI-powered vulnerability scanning. +\end{itemize} + +\subsection{Developer Tools Utilized} +\begin{itemize} + \item \textbf{Vite} - A Javascript build tool that can convert TypeScript code into well-optimized Javascript. Vite was selected over webpack due to its efficiency in build times. + \item \textbf{Docker} - A container management tool, which can pack software into a lightweight self-contained package that can be deployed in Kubernetes. + \item \textbf{Github Actions} - An automated CI/CD platform built into the Github platform. + \item \textbf{VSCode} - Code editor used to create the frontend UIs. + \item \textbf{PyCharm} - IDE used to develop \ac{gazer} and \ac{sherlock}. + \item \textbf{GoLand} - IDE used to develop the \ac{lazy-koala-operator}. + \item \textbf{Git} - Version control tool that was used to keep track of changes between releases. +\end{itemize} + +\subsection{Production Tools} +\begin{itemize} + \item \textbf{Tensorflow Serving} - A production-grade machine learning model serving system written in C++ to be as efficient as possible. + \item \textbf{Kubernetes} - Hosts the entire system along with the distributed system that gets monitored.
+\end{itemize} + +\subsection{Summary of Technology Selection} + +\begin{longtable}{|p{43mm}|p{110mm}|} + \hline + \textbf{Component} & + \textbf{Tools/Technologies Used} \\ \hline + + \ac{lazy-koala-operator} & + Go, etcd, Kubebuilder, Kube-API, Controller Manager, GoLand \\ \hline + + \ac{gazer} & + C, Python, Prometheus, BCC, Pandas, \ac{ebpf}, Kube-API, PyCharm \\ \hline + + \ac{sherlock} - Training Phase & + Python, Prometheus, Pandas, Numpy, PyTorch Lightning, PyCharm \\ \hline + + \ac{sherlock} - Production & + Rust, Prometheus, Tensorflow Serving, Artifact Registry \\ \hline + + User Interface & + TypeScript, React, Mantine, D3.js, Vite, VSCode \\ \hline + + Common for all & + Docker, Github Actions, Git, Kubernetes \\ \hline + + \caption{Summary of technology selection (self-composed)} +\end{longtable} \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/chapter-overview.tex b/documentation/PSPD/chapters/introduction/chapter-overview.tex new file mode 100644 index 0000000..8b0db46 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/chapter-overview.tex @@ -0,0 +1,3 @@ +\section{Chapter Overview} + +The introduction chapter offers an overview of the entire project. First, the author explains the problem domain of this research project, then moves on to the specific issue this project is going to address, then the motivation behind this project and its objectives, and finally concludes the chapter with the novelty of the research and the expected research challenges. \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/chapter-summary.tex b/documentation/PSPD/chapters/introduction/chapter-summary.tex new file mode 100644 index 0000000..a480da2 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/chapter-summary.tex @@ -0,0 +1,4 @@ +\section{Chapter Summary} + +This chapter gave an overview of cloud computing and how it matured over time, along with one of the most pressing issues currently holding back cloud-native applications. Then the author described their plans to tackle this problem and listed how it can be achieved, along with the unique challenges they have to overcome in order to complete this project. 
+ diff --git a/documentation/PSPD/chapters/introduction/existing-work.tex b/documentation/PSPD/chapters/introduction/existing-work.tex new file mode 100644 index 0000000..2e881a5 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/existing-work.tex @@ -0,0 +1,169 @@ +\section{Existing Work} + +\subsection{Anomaly detection} + +\begin{longtable}{| p{20mm} | p{43mm} | p{43mm} | p{43mm} |} +\hline + \textbf{Citation} & + \textbf{Technology summary} & + \textbf{Improvements} & + \textbf{Limitations} \\ \hline + \cite{du2018anomaly} & + Tested most of common machine learning methods to detect anomalies and benchmarked them & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Used SLIs to monitored data + \item A lot of good metrics (input data) + \item Performance monitoring of services and containers + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Only be able to identify predetermined issues + \item Require a sidecar that includes a lot of overhead + \item Won't work with event-driven architectures (this is where most of the new systems are headed) + \item Uses Supervised learning and it's near impossible to find real-world data with labels + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{kumarage2018anomaly} & + The authors here are proposing a semi-supervised technique using a Variational Autoencoder to predict future time steps and calculate the difference between predicted and actual to detect anomalies. & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Due to the difficulty of finding labeled research data, they settled on using a semi-supervised technique. + \item Used randomized decision trees were utilized to select the most suitable features for each component. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item The model won't be easily transformable for other systems + \item If more new key features were added to the system it will require a total retraining + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{kumarage2019generative} & + Uses a bidirectional \ac{gan} to predict future timesteps and uses MSE between prediction and real to determine the anomalies & + Experimented using a \ac{gan} to detect anomalies rather than using conventional autoencoders & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Accuracy is around 60\% which is not really good to use in production with mission-critical systems. + \item As this is a \ac{gan}-based system, it may take a lot of resources to run with production systems. 
+ \end{itemize} \\ \hline + \caption{Comparison of anomaly detection methods in distributed systems (self-composed)} +\end{longtable} + +\subsection{Root cause identification} + +\begin{longtable}{| p{20mm} | p{43mm} | p{43mm} | p{43mm} |} +\hline + \textbf{Citation} & + \textbf{Technology summary} & + \textbf{Improvements} & + \textbf{Limitations} \\ \hline + \cite{gonzalez2017root} & + Detect failures in networks, using machine learning to generate knowledge graphs on historical data & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Build a predictable system + \item Automatic identification of dependencies between system events + \item Doesn't Need to rely on Domain experts + \item Generalized to different systems + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Limited to network issues + \item Even though the knowledge graph helped with visualization of the problem but still, people have to manually figure out what went wrong + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{chigurupati2017root} & + Proposed a way to detect Hardware failures in servers using a probabilistic graphical model which concisely describes the relationship between many random variables and their conditional independence & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Find hidden meaning in values that seems random + \item Used a probabilistic approach to better understand the relationship between inputs and outputs + \item Gives all the possible root cause to a given problem + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Limited to hardware issues + \item Require support from domain experts + \item Can't account for unforeseen error + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{samir2019dla} & + This detects and locates the anomalous behavior of microservices based on the observed response time using a \ac{hhmm} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Custom HHMM model + \item Self-healing mechanism + \item Focus on performance detection and identification at the container, node, and microservice level + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Input dataset scale is limited + \item Require a sidecar + \item Needs to predetermined thresholds + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{wu2020microrca} & + Find Performance bottlenecks in distributed systems using an attribute graph to find anomaly propagation across services and machines & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Created a custom Faults Injection module + \item Uses an attribute graph to localize to faulty service + \item Application-agnostic by using a service mesh + \item Rely on service mesh to determine network topology + \item Uses unsupervised learning + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Only able to identify 3 types of issues + \item Looks only for performance anomalies + \item Use the slow response time of a microservice as the definition of an anomaly + \item Service meshes add a lot of overhead to systems + \item Required direct connection between services + \vspace{-7mm} + \end{itemize} \\ \hline + \caption{Comparison of root cause identification methods in distributed systems (self-composed)} +\end{longtable} + +\subsection{Commercial 
products} + +\begin{longtable}{| p{40mm} | p{55mm} | p{55mm} |} +\hline + \textbf{Name} & + \textbf{Features} & + \textbf{Limitations} \\ \hline + Applied Intelligence by New Relic & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Metric forecasting. + \item Anomaly detection. + \item Alert grouping to reduce noise. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Lack of explainability for certain classifications. + \item All the telemetry data needs to be sent to a third party. + \vspace{-7mm} + \end{itemize} \\ \hline + Watchdog by Datadog & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Monitors the metric data of the entire system in the background. + \item Monitors logging data. + \item Highlights relevant components affected by an issue. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Announced in 2018 but still in private beta. + \item Requires code changes and tight integration with the Datadog platform. + \item Available demos of the system seem to be engineered for demonstration purposes. + \vspace{-7mm} + \end{itemize} \\ \hline + \caption{Comparison of commercial products for root cause analysis (self-composed)} +\end{longtable} \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/main.tex b/documentation/PSPD/chapters/introduction/main.tex new file mode 100644 index 0000000..3c7a2ef --- /dev/null +++ b/documentation/PSPD/chapters/introduction/main.tex @@ -0,0 +1,15 @@ +\chapter{Introduction} +\pagenumbering{arabic} +\input{chapters/introduction/chapter-overview} +\input{chapters/introduction/problem-background} +\input{chapters/introduction/problem-definition} +% \input{chapters/introduction/research-motivation} +% \input{chapters/introduction/existing-work} +\input{chapters/introduction/research-aim} +\input{chapters/introduction/research-objectives} +\input{chapters/introduction/research-novelty} +% \input{chapters/introduction/research-contribution} +\input{chapters/introduction/research-challenge} +% \input{chapters/introduction/research-question} +% \input{chapters/introduction/project-scope} +\input{chapters/introduction/chapter-summary} \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/problem-background.tex b/documentation/PSPD/chapters/introduction/problem-background.tex new file mode 100644 index 0000000..bdd4a87 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/problem-background.tex @@ -0,0 +1,16 @@ +\section{Problem Domain} + +\subsection{Cloud Computing} +With the emergence of Infrastructure as a Service (IaaS) providers like Amazon Web Services (AWS) and Google Cloud Platform (GCP), there is a big surge in organizations trying to outsource their computing needs to third parties \citep{rimol_2021}. This is mainly due to the elasticity offered by all the cloud providers. Users can easily scale their infrastructure up and down within minutes without making any commitment. All the major providers bill users on a "what you use is what you pay" model. Since the cloud provider manages all the underlying infrastructure, users don't have to worry about problems like hardware failures. In contrast, in a self-hosted setting, if the user wants one extra GB of memory beyond what's available, it takes a lot of effort and cost to fulfil that requirement.
+ +\subsection{Cloud-Native Applications} +During the 90s and early 2000s, applications were typically built as one big monolith from a single code base \citep{LessonsF52:online}. Most of them were shipped as a single binary. Since applications in those days were fairly simple, this worked very well with little to no downsides. When the 2010s came around there were a lot of specialized frameworks and programming languages, and marketing teams wanted a lot of new features developed quickly while still maintaining reliability \citep{di2018migrating,Microser52:online}. But if the code base of the application was stored in a single repository, developers had to go through a long process to review and test whether changes would break the current systems. Developers were also limited by the frameworks and programming languages that were chosen for the project at the start. + +To tackle these problems a new way to develop applications, called "Microservices", was introduced. The idea behind this concept is to break all the functionality of a big monolithic application into small, individually scalable services and give ownership of each service to a small team that works separately. With this workflow developers are free to use whatever tools they like to develop each service. Because these services are developed in parallel by different teams, this increases development velocity by an order of magnitude \citep{Understa56:online}. + +As these services are relatively small and tailor-made to run on cloud environments, it's easier to take something that's running on the developer's local machine to the production cluster in a matter of minutes. This is mainly thanks to modern cloud-native tools like CI/CD pipelines which automatically build and test the code, saving a lot of time otherwise spent on repetitive tasks that are prone to human error \citep{Whataret68:online}. + +\subsection{Monitoring Cloud-Native Applications} \label{monitoring-bg} +Even though cloud-native applications have a lot to offer when it comes to developer velocity and productivity, they have their fair share of issues. Most of these problems are linked to the sheer complexity of these systems and not having a proper way to monitor them \citep{5WaysYou35:online}. All three major cloud providers provide a way to monitor these applications efficiently, and there are some great open-source projects that do this well. But to take full advantage of those systems, developers have to adapt their services to export all their vitals in a way the monitoring system understands. This works for the most part and is what all the big companies are doing; even though it takes more developer time, in the end it's crucial when it comes to disaster recovery. + +But there is still a slight problem with this approach. Once the system starts to scale up to hundreds of services, the number of vitals that have to be monitored grows into the thousands, which requires a lot of additional \acp{sres} and forces teams to drop a lot of non-crucial service vitals and derive abstract \acp{sli} to make it \textbf{humanly} possible to understand what's going on.\\
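To make the idea of an abstracted \ac{sli} concrete, the snippet below derives a single availability indicator from two raw request counters. This is only an illustrative sketch; the counter names and the numbers are made up and are not part of any system discussed in this document.

\begin{verbatim}
# Toy example: deriving an availability SLI for one service from raw
# request counters collected over a five-minute window (illustrative).
total_requests = 12_500   # every request the service handled
failed_requests = 37      # requests that ended in a server-side error

# The SLI condenses thousands of low-level vitals into a single ratio
# answering "what fraction of requests were served successfully?".
availability_sli = (total_requests - failed_requests) / total_requests

print(f"availability SLI = {availability_sli:.4%}")  # -> 99.7040%
\end{verbatim}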
diff --git a/documentation/PSPD/chapters/introduction/problem-definition.tex b/documentation/PSPD/chapters/introduction/problem-definition.tex new file mode 100644 index 0000000..5b6696a --- /dev/null +++ b/documentation/PSPD/chapters/introduction/problem-definition.tex @@ -0,0 +1,9 @@ +\section{Problem Definition} + +One of the main problems in monitoring microservices is the sheer amount of data they generate. It's humanly impossible to monitor the metrics of all the services, and it's hard for a single person to understand the entire system. To overcome this, \acp{sres} use abstracted metrics called \acp{sli} which measure the quality of the service at a higher level. Even though \acp{sli} can alert when there is an issue in the system, it's very hard to understand where the actual problem is from that alone. To understand the root cause of the problem, \acp{sres} need to dig into the \acp{apm} of all the services and go through the logs of each of the troubled services. + +When the system consists of hundreds or thousands of services that are interdependent, it's really hard to find where the actual issue is coming from, and it may require the attention of all the service owners of the failing services to go through the logs and \acp{apm} to identify the actual root cause of the failure. This can greatly increase the \ac{mttr} and waste a lot of developer time just looking at logs. + +\subsection{Problem Statement} + +Modern distributed systems are becoming so large and complex that, when a failure happens, it requires the collaboration of a large number of people to find the actual root cause. Implementing a machine learning model which watches over all the services and reacts to anomalies in real-time could greatly reduce the \ac{mttr}. \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/project-scope.tex b/documentation/PSPD/chapters/introduction/project-scope.tex new file mode 100644 index 0000000..3b50ef0 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/project-scope.tex @@ -0,0 +1,50 @@ +\section{Project Scope} + +From the literature survey and conversations with industry experts, the author found many issues that could be addressed when developing the system, but some of those problems, like the interpretability of autoencoders \citep{ribeiro2016should}, are hard to solve at an undergraduate level. As this project is done by one developer in less than one year, it won't be possible to create a fully functional monitoring platform like Datadog or New Relic. The focus of this project is to see if the author can develop a single model that can monitor all kinds of services after transfer learning with a few examples. \\ + + +\subsection{In-scope} \label{sec:in-scope} +Following are the main focuses of this project +\begin{itemize}[noitemsep,nolistsep] + \item Evaluation Framework + \begin{itemize}[noitemsep,nolistsep] + \item Ability to create a service mesh using Kubernetes-native resources. + \item Each service has the ability to simulate predefined error types. + \item The service mesh can be made up of services written in different programming languages and frameworks. + \item Built-in method to run stress tests. + \end{itemize} + \item Monitoring System + \begin{itemize}[noitemsep,nolistsep] + \item Low-overhead data collection pipeline to collect service telemetry. + \item A reliable system which generates fewer false positives, so it won't overwhelm the operators, while false negatives will be caught by the main monitoring system. + \item Models optimized to have a fairly small memory footprint and CPU overhead. + \item A well-generalized model which can be deployed with completely new services and will learn to adapt to the new system.
+ \end{itemize} +\end{itemize} + + +\subsection{Out-scope} \label{sec:out-scope} +Follow will not be covered during this project +\begin{itemize}[noitemsep,nolistsep] + \item Evaluation Framework + \begin{itemize}[noitemsep,nolistsep] + \item Support for every major language and framework. + \item Working outside of Kubernetes eco-system. + \end{itemize} + \item Monitoring System + \begin{itemize}[noitemsep,nolistsep] + \item Interpretability - Describing a behavior of autoencoder is a difficult task that won't be covered during the project. + \item System won't be trained against data from a real production system due to the lack of public datasets. + \item System won't have very high accuracy, as this will be the first line of defense this will try to avoid false positives to prevent adding more noise to alerting systems. + \item Automatically identify system topology. + \item This will not be a drop-in replacement for existing monitoring systems, rather this will work with existing monitoring systems to reduce the \ac{mttr}. + \end{itemize} +\end{itemize} + +\subsection{Prototype Feature Diagram} +\begin{figure}[H] + \centering + \includegraphics[width=16cm]{assets/introduction/High-level-system-diagram.png} + \caption{Prototype feature diagram (self composed)} + \label{fig:high-level-diagram} +\end{figure} \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/research-aim.tex b/documentation/PSPD/chapters/introduction/research-aim.tex new file mode 100644 index 0000000..74dc834 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/research-aim.tex @@ -0,0 +1,5 @@ +\section{Research Aim} + +\textit{The aim of this research is to design, develop and evaluate a toolkit to help system operators to reduce the MTTR when the system is experiencing an anomaly. This will be achieved by using machine learning models to investigate all the services in the system and highlighting the most probable root causes in order, So the operators don’t have to find a needle in a haystack.} + +In this project the author tries to create a single model that can monitor all the vitals of a given service and output an anomaly score in any given time window. The author is hoping to make it generalized enough so operators can take the same model and deploy it with other services and the model will adopt to the new services using a few-shot learning method \citep{wang2020generalizing}. To do this, author is trying to create a data encoding technique to represent monitoring data in a programming language/framework independent way. To achieve this goal the author is also hoping to create a lightweight service instrumentation pipeline that can collect and process telemetry data in real-time without requiring any additional work from the user's end. \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/research-challenge.tex b/documentation/PSPD/chapters/introduction/research-challenge.tex new file mode 100644 index 0000000..f0407e8 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/research-challenge.tex @@ -0,0 +1,16 @@ +\section{Research Challenge \& Potential} + +% Even though this project seems very straightforward and easy to implement from a high level, it becomes problematic when it comes to reaching targets the author has set for himself. For an example, interpretability was one of the most requested features from the industry experts and a must-have trait for mission-critical systems \citep{ribeiro2016should}. 
But it was left out of the project scope due to its complexity especially when it comes to an undergraduate level project. Other than that following are a few of the more difficult challenges the author is expected to face while conducting the research. + +\subsection{Research Challenge} + +\begin{itemize}[leftmargin=*] + \item \textbf{Highly seasonal and noisy patterns} - Monitoring metrics on microservices on production tends to have very unpredictable patterns depending on the traffic that has been sent to the service. The amount of traffic sent will depend on several external factors that are hard to determine. Modeling both temporal dependencies and static interdependencies found in telemetry data of services into a single graph will be very difficult and require a lot of fine-tuning and data engineering skills. + \item \textbf{Overhead} - Modern deep learning models can solve any problem if we could give them an unlimited amount of data and processing power. Although in this case, the models need to optimize for efficiency over accuracy since having a monitoring system that consumes a lot more resources than the actual target system isn't effective. + \item \textbf{Fit into Kubernetes eco-system} - Kubernetes has become the de-facto standard to managing distributed systems \citep{WhatisCo78:online}. So the author is planning to create a Kubernetes extension that will bridge the connection between monitored service and monitoring model. But Kubernetes itself has a very steep learning curve, even the original developers themselves have admitted, Kubernetes is too hard and complex for beginners \citep{Googlead4:online}. + \item \textbf{Extraction of Telemetry} - Even though it's considered the best practice to implement telemetry exporting methods in the development phase of any application, developers often skip this part to save time. Sometimes it's required to depend on external applications that are developed by third parties which don't have means of exporting telemetry. When building an end-to-end root course indication platform, it's required to take these kinds of scenarios into account as well. +\end{itemize} + +\subsection{Research Potential} + +Initial feedback received for this project has been very positive due to the fact that this is a very common yet still unsolved issue in reliability engineering. Since this project is developed as a set of loosely coupled components, some of the experts expressed their interest in using individual components to solve some of the other problems they have been experiencing over time. Finally, this project can be used as a starting-off position for future researches which are focusing on specific areas of \ac{aiops} by replacing individual components of this with their owns. \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/research-contribution.tex b/documentation/PSPD/chapters/introduction/research-contribution.tex new file mode 100644 index 0000000..a8662bf --- /dev/null +++ b/documentation/PSPD/chapters/introduction/research-contribution.tex @@ -0,0 +1,10 @@ +\section{Research Contribution} + + +\subsection{Domain Contribution} + +With this research, the author first tries to develop a \textbf{cloud-native solution to create a configurable microservices system}, So this research and future researches will have a standard environment to develop and evaluate their work. 
The author also hopes to build a lightweight and \textbf{low-overhead data collection pipeline} using \ac{ebpf} to collect telemetry of target services without any instrumentation from the user. + +\subsection{Knowledge Contribution} + +One of the main problems with monitoring microservices systems is that different services can be developed with different programming languages and frameworks, and those can contain different levels of noisiness\label{need-for-encoding}. So it's hard for a single model to detect anomalies in any service, since some frameworks tend to use more resources while idle than others. To address this, the author is trying to come up with an \textbf{encoding method} so the model can be trained to monitor one framework and those learnings will still be valid for another framework. With those encoded data the author is hoping to develop a \textbf{convolutional autoencoder that will use unsupervised learning to spot anomalies in a given data stream}. This may have better performance while using fewer resources, since convolutional layers are typically lightweight and good at pattern recognition \citep{oord2016wavenet}. Finally, the author is planning to aggregate the predictions from the models into a pre-generated service graph and weigh it to \textbf{find all possible root causes}. diff --git a/documentation/PSPD/chapters/introduction/research-gap.tex b/documentation/PSPD/chapters/introduction/research-gap.tex new file mode 100644 index 0000000..5ef5c90 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/research-gap.tex @@ -0,0 +1,7 @@ +\section{Research Gap} + +After the literature survey the author came to the conclusion that finding the root cause of any failure within a distributed system is a very difficult issue, because there is no single output one can try to predict, and most researchers have built their own simulation of a distributed system since there isn't any open dataset of monitoring data, mainly because such data could contain sensitive information. + +Most currently established research is aimed at creating statistical models like clustering and linear regression. Even though these algorithms perform very well in small-scale systems, they struggle to keep up when the monitoring data become very noisy with scale. Another problem none of these papers properly addressed was constant changes to services. Almost all published research considers target services as static, but in reality these services can change even many times per day \citep{GoingtoM51:online}. + +After talking with industry experts, the author identified three main concerns they all had with using a machine learning model as a monitoring agent: reliability, interpretability, and tunability. On reliability, experts said too many false positives will make operators lose faith in the system because it becomes yet another distraction for them. As operators have to take critical decisions based on the output of these models, the output has to be interpretable by humans \citep{ribeiro2016should}. Finally, this system should act more like a tool rather than a replacement for human operators, because machine learning models cannot compete with the amount of context a human can handle.
diff --git a/documentation/PSPD/chapters/introduction/research-motivation.tex b/documentation/PSPD/chapters/introduction/research-motivation.tex new file mode 100644 index 0000000..aaec51d --- /dev/null +++ b/documentation/PSPD/chapters/introduction/research-motivation.tex @@ -0,0 +1,3 @@ +\section{Research Motivation} + +Modern distributed systems generate tons of useful and not-so-useful telemetry data. As the system grows in demand and size, these telemetry data only get noisier and more complex \citep{Untangli35:online}. It's difficult for humans to make sense of all these data, especially if they don't have many years of experience with the system. On the other hand, deep learning models thrive when they have a lot of data to learn from. As these models can be trained in computer-simulated environments, they can learn within days concepts that take humans years to grasp \citep{OpenAI_dota, silver2017mastering}. Finally, unlike humans, a deep learning model can monitor a service 24x7 without taking any breaks, which will not only prevent some outages before they happen but could also reduce \ac{mttr} because an issue can be detected far earlier than any human could manage. \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/research-novelty.tex b/documentation/PSPD/chapters/introduction/research-novelty.tex new file mode 100644 index 0000000..16ce162 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/research-novelty.tex @@ -0,0 +1,14 @@ +\section{The Novelty of the Research} + +\subsection{Problem Novelty} + +After a literature survey, the author concluded that finding the root cause of any failure within a distributed system is a challenging issue. This is mainly due to the fact that this problem can't be mapped to a fixed set of inputs and outputs, which is a basic requirement for almost all types of neural networks that are readily available. + +% Furthermore, almost all the researchers working on this problem domain have built their own solution for simulating a distributed system, since there isn't any open dataset on service monitoring. This could be mainly due to the fact that these datasets could contain sensitive information. + +Most of the currently established research was done towards creating statistical models like clustering and linear regression. Even though these algorithms perform very well in small-scale systems, they can struggle to keep up with the large-scale, noisy monitoring data that are found in medium to large systems. Another problem that was recognized was that none of these papers properly addressed the issue of constant changes in services. Most published research considers target services as static, but in reality these services can change, even many times per day \citep{GoingtoM51:online}. + +\subsection{Solution Novelty} + + +The focus of this project is to create an adaptable and scalable series of components that range from instrumentation to root cause analysis, which can be well integrated into an existing system or extended to fit newer use cases. To achieve this the author is utilizing a fairly new technique called \ac{ebpf} for instrumentation, which is a Linux kernel API that can be used to track kernel events such as TCP socket changes to understand the network layer of each application running on the system.
Finally, for anomaly detection, a convolutional autoencoder with a novel data encoding method was used to keep the system as lightweight as possible, while still having acceptable accuracies for classifications. Combining that with a weighted graph generated from collected network activity data can be used to highlight to blast radius of an anomaly along with possible causes. \ No newline at end of file diff --git a/documentation/PSPD/chapters/introduction/research-objectives.tex b/documentation/PSPD/chapters/introduction/research-objectives.tex new file mode 100644 index 0000000..74b0b1f --- /dev/null +++ b/documentation/PSPD/chapters/introduction/research-objectives.tex @@ -0,0 +1,141 @@ + +\newpage +\section{Research Objectives} + +\newcommand\robProblemIdentification{ +When selecting the problem the author wanted to pursue, they had 3 main goals. +\begin{enumerate}[leftmargin=*,noitemsep,nolistsep,label=RO\arabic*:] + \item The problem domain should be something they would enjoy working in. + \item At the end of the research they should have done a meaningful impact to the target domain, both in the theoretical and practical aspect. + \item It should be challenging to achieve and results should speak for themselves. + \vspace{-7mm} +\end{enumerate} +} + +\newcommand\robLiteratureReview{ +Conduct a Literature review on root cause analysis, +\begin{enumerate}[leftmargin=*,noitemsep,nolistsep,label=RO\arabic*:] + \setcounter{enumi}{3} + \item To find the current methods that are used for anomaly detection and root cause localization. + \item Uncover issues with the current approaches. + \item Understand how advancements in other related domains can be apply to this domain. + \vspace{-7mm} +\end{enumerate} +} + + +\newcommand\robDevelopingEvaluation{ +During the literature survey, one of the problem the author identified was there isn’t a uniform dataset when it comes to training or evaluating models to detect anomalies in microservices. Most of the researchers used private datasets to train and test their work. +To address this author is developing, +\begin{enumerate}[leftmargin=*,noitemsep,nolistsep,label=RO\arabic*:] + \setcounter{enumi}{5} + \item A tool that can easily simulate a distributed system in a cloud-native setting. + \item A tool that can inject anomalies into the running services. + \vspace{-7mm} +\end{enumerate} +} + +\newcommand\robPublishPlayground{ +The author is hoping to publish a paper about the above-mentioned tool so the future researchers will have a unified way to train, test, and benchmark their system without having to reinvent the wheel again and again. +} + +\newcommand\robDataGathering{ +In order to create a model to detect anomalies, the author will, +\begin{enumerate}[leftmargin=15mm,noitemsep,nolistsep,label=RO\arabic*:] + \setcounter{enumi}{7} + \item Simulate a distributed system. + \item Simulate the traffic inside the system. + \item Collect monitoring data while it's running. + \vspace{-7mm} +\end{enumerate} +} + +\newcommand\robDevelopingEncoding{ +Since these microservices will report very different metric values even at idle depending on the architecture of the service. To normalize theses data points from all the services to one format author will, +\begin{enumerate}[leftmargin=15mm,noitemsep,nolistsep,label=RO\arabic*:] + \setcounter{enumi}{10} + \item Evaluate current data encoding methods like \cite{zhang2019deep}. + \item Find the best one to fit and optimize it to this use case. + \item Test if there is an improvement by using that method. 
+ \vspace{-7mm} +\end{enumerate} +} + + +\newcommand\robDevelopingModel{ +According to \cite{kumarage2019generative} autoencoders tend to perform best when it comes to anomaly detection. But during the literature survey it was revealed that convolution autoencoders weren't tested for this usecase. So the author is hoping to develop a convolution autoencoders and test how it will perform. +} + + +\newcommand\robTesting{ +Following things will be tested during the testing phase, +\begin{enumerate}[leftmargin=15mm,noitemsep,nolistsep,label=RO\arabic*:] + \setcounter{enumi}{13} + \item How will the system classify long-term \& short-term fluctuations. + \item What will be the overhead of the system. + \item Can the system understand the mapping between core metrics like CPU and Memory usages. + \item Accuracy of fault detection. + \item Reliability of the instrumentation system. +\vspace{-7mm} +\end{enumerate} +} + +\newcommand\robIntegration{ +Having a fancy model doesn’t add means anything if it’s very hard to use with a real system. So the author is hoping to develop a Kubernetes extension that will map the model with any service given by the user. +} + + +\begin{longtable}{|p{20mm}|p{90mm}|p{19mm}|p{17mm}|} +\hline + \textbf{Research Objectives} & + \textbf{Explanation} & + \textbf{Learning Outcomes} & + \textbf{Research Questions} \\ \hline + + Problem identification & + \robProblemIdentification & + LO1 & + RQ1 \\ \hline + + Literature review & + \robLiteratureReview & + LO3, LO4, LO6 & + RQ1, RQ2, RQ3, RQ4 \\ \hline + + Developing an evaluation framework & + \robDevelopingEvaluation & + LO7 & + RQ4 \\ \hline + + Publish a paper about that playground & + \robPublishPlayground & + LO7 & + N/A \\ \hline + + Data gathering and analysis & + \robDataGathering & + LO7 & + RQ2, RQ4 \\ \hline + + Developing encoding method & + \robDevelopingEncoding & + LO2, LO5, LO7 & + RQ2 \\ \hline + + Developing the model & + \robDevelopingModel & + LO2, LO5, LO7 & + RQ3 \\ \hline + + Testing and evaluation & + \robTesting & + LO8, LO9 & + RQ4 \\ \hline + + Integration & + \robIntegration & + LO7 & + RQ1 \\ \hline + +\caption{Research objectives (self-composed)} +\end{longtable} diff --git a/documentation/PSPD/chapters/introduction/research-question.tex b/documentation/PSPD/chapters/introduction/research-question.tex new file mode 100644 index 0000000..23ec8a2 --- /dev/null +++ b/documentation/PSPD/chapters/introduction/research-question.tex @@ -0,0 +1,17 @@ + +\section{Research Question} + + +\begin{enumerate}[leftmargin=*,label=\textbf{RQ\arabic*:}] + +\item How can a machine learning model improve \ac{mttr} in a distributed system? + +\item What is the most efficient way to present raw data monitoring to machine learning model? + +\item What will be the most ideal machine learning model to uncover anomalies in a microservice? + +\item What are the methods that can be used to evaluate a root cause prediction system? 
+ +\end{enumerate} + + diff --git a/documentation/PSPD/chapters/literature-review/approach.tex b/documentation/PSPD/chapters/literature-review/approach.tex new file mode 100644 index 0000000..c660b4c --- /dev/null +++ b/documentation/PSPD/chapters/literature-review/approach.tex @@ -0,0 +1,283 @@ +\section{Technologies} + +\subsection{Monitoring Techniques} + +\subsubsection{Process Monitor} + +One of the most basic forms of application monitoring is the process (or system) monitor \citep{WhatisaS27:online}, where the target application is monitored using the operating system's process manager, which keeps track of the resource usage of each running process. This works well for small monolithic applications where there is typically only one application per server. + +\subsubsection{Hypervisor Based Monitor} + +Around the early 2000s, a new way to manage servers called virtualization got popularized \citep{Whatisvi12:online}. The idea behind this was to split one big server into smaller isolated \acp{vm}, which made sharing and managing hardware resources easier. To achieve this the host computer runs software called a hypervisor which can create and delete guest \acp{vm} on demand \citep{Mergen_Uhlig_Krieger_Xenidis_2006}. As the hypervisor is responsible for managing each \ac{vm}'s resource requests, it can be used to observe the resource usage. Most cloud providers still rely on hypervisors to create \acp{vm} to sell to customers \citep{7waysweh13:online}. One of the most commonly used hypervisors is the Kernel-based Virtual Machine (KVM), which uses the Linux kernel itself as the hypervisor; it has a very small overhead on the host machine and grants better visibility into the guest system \citep{kivity2007kvm}. + +\subsubsection{Service Mesh} \label{sec:service-mesh} + +When containerized distributed systems started getting popular, both developers and operators needed to gather more insights about each application. Manually programming each application to collect telemetry is a time-consuming task, and developers were reluctant to implement it. To overcome this challenge, service meshes were introduced \citep{li2019service}. The idea behind this is to keep the target service as it is and build a wrapper around it which will do all the instrumentation for the service. This is usually implemented as a side-car proxy \citep{Whatissi48:online}. + +\begin{figure}[H] + \includegraphics[width=16cm]{assets/literature-review/sidecar-proxy.png} + \caption{Sidecar proxy design pattern \citep{Whatissi48:online}} + \label{fig:sidecar-proxy} + % https://cdn.ttgtmedia.com/rms/onlineimages/whatis-sidecar_proxy.png +\end{figure} + +As this proxy sits outside of the service, it is language-agnostic; it intercepts all the inbound and outbound traffic at the application layer and relays it to the responsible parties. This is a very effective method to provide a lot of visibility into services without requiring any additional work from the developers. The main problem with this approach is that analyzing every request with a proxy adds a lot of both network and CPU overhead \citep{Benchmar93:online}.
+ +\begin{figure}[H] + \includegraphics[width=11cm]{assets/literature-review/linkerd-benchmark.png} + \caption{Service mesh benchmark \citep{Benchmar93:online}} + \label{fig:linkerd-benchmark} + % https://linkerd.io/images/benchmark/latency-20rps.png +\end{figure} + +\subsubsection{Extended Berkeley Packet Filter (eBPF)} + +\ac{ebpf} is a feature introduced to the Linux kernel in version 3.15 that allows deploying sandboxed programs into kernel space at run-time, which can be used for application instrumentation at the kernel level \citep{LKMLIngo52:online}. \ac{ebpf} essentially created a way to run hooks on kernel events. For example, the kernel method "tcp\_v4\_syn\_recv\_sock" gets called every time a client wants to establish a TCP connection with the server. With \ac{ebpf}, users can deploy a lightweight hook that is called every time a new connection is made, which updates state in an \ac{ebpf} map that can be read from user-space. + +\begin{figure}[H] + \includegraphics[width=11cm]{assets/literature-review/ebpf-architecture.png} + \caption{eBPF architecture \citep{WhatiseB46:online}} + \label{fig:ebpf-architecture} + % https://ebpf.io/static/map\_architecture-e7909dc59d2b139b77f901fce04f60a1.png +\end{figure} + +Using this data the request rate of a given service can be calculated with minimal overhead and zero instrumentation on the application. The main drawback of this method is that it's very difficult to capture high-level data like HTTP status codes, since those data only exist at the application layer.
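As an illustration of how small such a hook can be, the sketch below uses the BCC toolkit's Python bindings to attach a kprobe to "tcp\_v4\_syn\_recv\_sock" and count accepted TCP connections per process in an \ac{ebpf} map that is read from user space every ten seconds. This is only a minimal sketch, not the instrumentation built for this project: the probe name comes from the discussion above, but the function and map names are illustrative, root privileges and kernel headers are required, and the exact attach point can differ between kernel versions.

\begin{verbatim}
from time import sleep
from bcc import BPF

# Kernel-side program: count new inbound TCP connections per process.
bpf_text = r"""
#include <uapi/linux/ptrace.h>

BPF_HASH(conn_count, u32, u64);

int on_syn_recv(struct pt_regs *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    conn_count.increment(pid);
    return 0;
}
"""

b = BPF(text=bpf_text)
# The hook fires every time the kernel accepts a new TCP connection.
b.attach_kprobe(event="tcp_v4_syn_recv_sock", fn_name="on_syn_recv")

while True:
    sleep(10)
    # Read the shared eBPF map from user space: this gives a rough
    # per-process request rate for the last ten-second window.
    for pid, count in b["conn_count"].items():
        print(f"pid={pid.value} new_connections={count.value}")
    b["conn_count"].clear()
\end{verbatim}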
+ +\begin{longtable}{| p{23mm} | p{42mm} | p{42mm} | p{42mm} |} +\hline + \textbf{Technique} & + \textbf{Advantage} & + \textbf{Disadvantage} & + \textbf{Citations} \\ \hline + + Process Monitor & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Virtually no overhead. + \item Works out of the box. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item A very limited number of data points. + \item Not suited for a distributed system. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item \cite{chigurupati2017root} + \item \cite{kumarage2018anomaly} + \item \cite{kumarage2019generative} + \vspace{-7mm} + \end{itemize} \\ \hline + + Hypervisor Based Monitor & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Most cloud providers give a simple API to access this data. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item A limited number of data points + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item \cite{du2018anomaly} + \item \cite{geethika2019anomaly} + \vspace{-7mm} + \end{itemize} \\ \hline + + Service Mesh & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item In-depth monitoring + \item Request analysis and modification on the fly + \item Framework-independent + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Performance overhead + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item \cite{samir2019dla} + \item \cite{wu2020microrca} + \vspace{-7mm} + \end{itemize} \\ \hline + + Extended Berkeley Packet Filter & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Very low overhead + \item Works at the kernel level + \item Able to scrape any data point related to the system + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Works only on Linux-based systems + \item Difficult to develop and use + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item None + \vspace{-7mm} + \end{itemize} \\ \hline + + \caption{Comparison of instrumentation methods (self-composed)} +\end{longtable} + + +\subsection{Detecting Anomalies} + +\subsubsection{Supervised Learning}\label{sec:approch-supervised} +The most popular way to detect anomalies, in general, is using supervised learning methods. From finding outliers in sales patterns to fraud detection, supervised learning methods can be applied. \cite{du2018anomaly} mentioned that even in the cloud computing domain more than half of the methods still rely on supervised learning to detect anomalies. Among these, Support Vector Machines (SVMs), Random Forests and Decision Trees were used the most. But one of the main downsides to using supervised learning in cloud computing is the lack of labeled anomalous data. Since most systems nowadays target at least 99\% uptime, finding labeled data is difficult. Even if there is a well-balanced dataset, the trained model won't be able to recognize unforeseen anomalies. + +\subsubsection{Semi-Supervised Learning} + +As mentioned in \ref{sec:approch-supervised}, one of the most challenging issues with anomaly detection is finding a dataset with enough labeled abnormal samples. One of the key contributing factors to developing a well-generalized model is having a well-balanced dataset \citep{batista2004study}. If a sizable, even unbalanced, dataset can be found, using clustering algorithms like K-Nearest Neighbors (KNN) could yield better results. \cite{akcay2018ganomaly} managed to utilize an encoder-decoder-encoder architectural model for detecting anomalies in image data and achieved remarkable results. + +\subsubsection{Unsupervised Learning} + +When the target dataset consists of a lot of unlabeled data that is difficult to label by hand, machine learning experts lean towards using unsupervised learning so the model can be its own teacher \citep{Unsuperv29:online}. \cite{silver2016mastering} managed to develop the first AI that was able to beat the best Go player in the world with a score of 4-1.
This model learns to play the game of Go by look at thousands of games played by humans and learning to approximate the optimal strategy to any given board position. One year later same authors released the updated version of the model which learn to play Go without any human interference and this model beat the previously published model 100-0 \citep{silver2017mastering}. This proves deep learning models could even surpass humans when it comes to finding patterns in very large distributions. + +\begin{longtable}{| p{23mm} | p{42mm} | p{42mm} | p{42mm} |} +\hline + \textbf{Technique} & + \textbf{Advantage} & + \textbf{Disadvantage} & + \textbf{Citations} \\ \hline + + Supervised Learning & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Easy to develop and train. + \item Models will converge better to the dataset. + \item Easy to test the model performance. + \item Ideal used for classification and regression problems. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Require a labeled dataset. + \item The models won’t look at scenarios outside of the dataset. + \item The model will be biased if the labeled dataset was biased. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item \cite{du2018anomaly} + \vspace{-7mm} + \end{itemize} \\ \hline + + Semi-Supervised Learning & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item A blend of both Supervised and Unsupervised learning. + \item Developers can force the model to learn some behaviors. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Require a labeled dataset to train the initial steps. + \item Model isn’t free to understand the problem from the ground up. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item \cite{akcay2018ganomaly} + \vspace{-7mm} + \end{itemize} \\ \hline + + Unsupervised Learning & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Doesn’t require a labeled dataset. + \item Excel at clustering extracting patterns from datasets. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Developers have no control over the model’s behavior. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item \cite{kumarage2018anomaly} + \item \cite{zhang2019deep} + \item \cite{kumarage2019generative} + \item \cite{khoshnevisan2019rsm} + \vspace{-7mm} + \end{itemize} \\ \hline + \caption{Comparison of anomaly detect methods in distributed systems (self-composed)} +\end{longtable} + +\subsection{Root Cause Identification} + +It's very difficult to use a standard learning algorithms like Multilayer Perceptrons (MLP) to predict faulty service because the number of microservices in distributed systems changes frequently. Even if the system were to retrain the model after every new deployment it will hesitate to predict newly added services as the root cause since it doesn't have any historical data about the service to make assumptions. 
So almost all published research uses either Key Performance Indicator (KPI) correlation or some variation of graph-based methods to predict the root cause of failures \citep{soldani2021anomaly}. + +\begin{longtable}{| p{23mm} | p{42mm} | p{42mm} | p{42mm} |} +\hline + \textbf{Technique} & + \textbf{Advantage} & + \textbf{Disadvantage} & + \textbf{Citations} \\ \hline + + KPI correlation & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Could find indirectly affected services. + \item Easy to implement. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Search space is large. + \item Could result in a lot of false positives and noisy outputs. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item \cite{nguyen2011pal} + \item \cite{nguyen2013fchain} + \item \cite{wang2020root} + \vspace{-7mm} + \end{itemize} \\ \hline + + Graph-based methods & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Gives clear visual reasoning for the predictions. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item Could miss out on indirectly affected components. + \item Computing the causality graph could be expensive. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=0mm,noitemsep,nolistsep,label={}] + \item \cite{samir2019dla} + \item \cite{wu2020microrca} + \item \cite{ma2020automap} + \item \cite{meng2020localizing} + \vspace{-7mm} + \end{itemize} \\ \hline + + \caption{Comparison of root cause identification techniques (self-composed)} +\end{longtable} + +\subsection{Evaluation} + +Since this project consists of 3 components working together, each can be evaluated separately and, in the end, as a whole system. + +To evaluate an instrumentation system, the author is hoping to use a static load generator like \href{https://github.com/MrSupiri/MicroSim}{MicroSim}. First, a well-established instrumentation system like Linkerd will be installed and the static load generator will create some traffic. After the data is collected, the proposed instrumentation system will be installed and the same traffic will be simulated. Finally, the data from the first experiment will be compared with the second to evaluate the reliability of the instrumentation system. + +Almost all the publications the author has reviewed related to anomaly detection and root cause analysis used three key evaluation metrics to measure the performance of these models: precision, recall, and F1 score \citep{buckland1994relationship}. Precision denotes the proportion of correct positive classifications (\( Precision = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}} \)). In the context of anomaly detection, it measures how many of the events the model flagged as anomalous actually were anomalous; a precision of 1.0 means the model generated zero false positives. Recall, on the other hand, measures how many of the actual anomalies the model managed to catch, so it penalizes false negatives (\( Recall = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}} \)).
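The following toy calculation makes the two definitions concrete; the counts are purely illustrative and do not come from any experiment in this document.

\begin{verbatim}
# Hypothetical results from testing an anomaly detector on 1,000 windows.
true_positives  = 40   # anomalous windows the model flagged correctly
false_positives = 10   # normal windows the model flagged by mistake
false_negatives = 20   # anomalous windows the model missed

precision = true_positives / (true_positives + false_positives)  # 0.80
recall    = true_positives / (true_positives + false_negatives)  # ~0.67

print(f"precision={precision:.2f} recall={recall:.2f}")
\end{verbatim}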
The problem with these two metrics is that each one only explains part of the story. For example, a model can achieve a seemingly perfect precision simply by flagging almost nothing as anomalous: with very few positive predictions there are hardly any false positives, yet most real anomalies are missed. If the test dataset is heavily biased towards non-anomalous samples, such a model still looks good even though it is of little practical use. To fix that problem the F1 score can be used. The F1 score seeks a balance between precision and recall (\( F1 = 2*\frac{Precision*Recall}{Precision+Recall} \)) and generally gives better insight into classification models when there is an imbalanced class distribution in the dataset \citep{Accuracy18:online}. diff --git a/documentation/PSPD/chapters/literature-review/chapter-overview.tex b/documentation/PSPD/chapters/literature-review/chapter-overview.tex new file mode 100644 index 0000000..1af2a73 --- /dev/null +++ b/documentation/PSPD/chapters/literature-review/chapter-overview.tex @@ -0,0 +1,4 @@ +\section{Chapter Overview} + +Since most research projects are continuations of, or different approaches to, existing problems, one of the first and most crucial steps in doing research is to conduct a literature survey. This usually has to be done to understand the problem domain, the currently published work on the problem, established methods of solving the target and related problems, tools and technologies that can be utilized, and, last but not least, evaluation methodologies for that particular problem. So in this chapter the author will discuss how this problem came to be, what different components need to be developed and why, and finally conclude the chapter with possible approaches and tools that can be used to solve the aforementioned problem. + diff --git a/documentation/PSPD/chapters/literature-review/chapter-summary.tex b/documentation/PSPD/chapters/literature-review/chapter-summary.tex new file mode 100644 index 0000000..c5ad4ea --- /dev/null +++ b/documentation/PSPD/chapters/literature-review/chapter-summary.tex @@ -0,0 +1,3 @@ +\section{Chapter Summary} + +In this chapter, the author discussed the origin and evolution of \ac{sre} and \ac{aiops} and how that paved the way for the problem being tackled in this project. Then the author went on to discuss the techniques and tools that can be used to tackle anomaly detection and root cause analysis in distributed systems, listing the pros and cons of every technique and citing sources for these claims. Finally, the author showcased the three main components of automated root cause analysis using the published literature and concluded the chapter with a comparison of the most relevant work for each of the identified components. \ No newline at end of file diff --git a/documentation/PSPD/chapters/literature-review/concept-map.tex b/documentation/PSPD/chapters/literature-review/concept-map.tex new file mode 100644 index 0000000..cfc0fa5 --- /dev/null +++ b/documentation/PSPD/chapters/literature-review/concept-map.tex @@ -0,0 +1,10 @@ +\section{Concept Map} + +During the literature survey the author analyzed over fifteen different research publications and articles to get a deeper understanding of the problem domain. In the diagram below the author has summarized these findings into a visual representation and highlighted the issues and the techniques he is planning to use in order to solve the problem.
+ + +\begin{figure}[H] + \includegraphics[width=16cm]{assets/literature-review/concept-map.png} + \caption{Concept map (self-composed)} + \label{fig:concept-map} +\end{figure} diff --git a/documentation/PSPD/chapters/literature-review/existing-work.tex b/documentation/PSPD/chapters/literature-review/existing-work.tex new file mode 100644 index 0000000..955819e --- /dev/null +++ b/documentation/PSPD/chapters/literature-review/existing-work.tex @@ -0,0 +1,251 @@ +\section{Existing Systems} + +As the large-scale migration towards the cloud and microservices started fairly recently, and the problem this research is trying to solve mostly affects large-scale enterprises, there isn't a lot of published research in this domain. The work done towards uncovering the root cause of failures by large corporations has either been kept for internal use or sold as a \ac{saas} product. + +One of the best implementations found on root cause analysis is from Datadog. They created a platform called Watchdog \citep{Watchdog76:online} which monitors the entire system for anomalies and failures in the background. When a failure happens it tries to pull all the relevant stack traces and monitoring data into a single view so the developer can diagnose the problem easily. The problem with this solution is that even though it was announced back in July 2018, what exists is currently in a private beta which not everyone has access to. +\\ +All the currently published work on microservices monitoring can be classified into 3 main categories +\begin{enumerate} +\item Instrumentation +\item Anomaly detection +\item Root cause identification +\end{enumerate} + +\subsection{Instrumentation} +One of the first steps that needs to be done to get visibility into a running service is to set up a data collection pipeline that collects performance data about the service and writes it to persistent storage for later evaluation. + +Currently, the most popular way to collect telemetry data about microservices is using an open-source tool called \href{https://prometheus.io/}{Prometheus}, which was created by SoundCloud and later donated to the \ac{cncf}. Usually Prometheus is paired with \href{https://grafana.com/}{Grafana}, which visualizes these data so operators can make educated guesses. \cite{toka2021predicting} proposed a Kubernetes-native data analytics pipeline that uses Prometheus for data scraping and runs real-time analysis on the results. The problem with this approach is that to get some key data, like the number of inbound and outbound requests, the service has to be architected to work with Prometheus. If it's third-party software without Prometheus integration, the system is limited to scraping metrics like CPU and memory usage that are exposed by the operating system. + +\subsection{Anomaly detection} + + +Anomaly detection in time series is a field of its own. According to \cite{hagemann2020systematic}, anomaly detection in cloud computing environments dates back to 2012, with early work mostly based on statistical approaches. Since 2014 there has been a big shift towards using machine learning-based approaches to detect anomalies, driven by the sheer number of data points modern systems generate and the complexity of those data.
+ +\begin{figure}[H] + \includegraphics[width=10cm]{assets/literature-review/num-of-anomaly-detection-papers.jpg} + \caption{Number of papers published on anomaly detection in cloud computing environments \citep{hagemann2020systematic}} + \label{fig:num-of-anomaly-detection-paperss} +\end{figure} + +\subsubsection{Standard machine learning} + +\cite{du2018anomaly} experimented with four of the most popular machine learning techniques to detect performance anomalies. To do this they used an open-source virtual IP Multimedia Subsystem called Clearwater. Their system had three modules: a monitoring agent, data processing, and a fault injector. To run the experiment they first used a load generator to create traffic inside the system. Then they used the monitoring agent to collect telemetry data while the fault injector was introducing random faults into the system. Finally, they combined the data from the monitoring agent and the fault injector to create the dataset to train four machine learning models. After training, each model was plugged into the data processing module and tested for its precision, recall, and F1-score. In the end, they concluded the K-Nearest Neighbors classifier gives the most accurate classifications while Support Vector Machines performed the worst. + + +\subsubsection{Encoder-decoder networks} + +\textbf{Auto-encoder} + +Auto-encoders are a type of neural network that tries to reconstruct its own input. To do that, the model has to learn to pass the input data to the output layer through a bottleneck layer. After training, this bottleneck layer, called the latent space, holds a compressed low-level representation of the entire input data distribution \citep{hinton2006reducing}. So the job of the autoencoder is to understand the underlying pattern in the given data distribution. + +Due to this nature, autoencoders have become a popular method of detecting anomalies in time series data, since to find anomalies in an input sequence one has to learn to tell the normal from the abnormal. After training the model on the target dataset, it should be able to come up with a generalized function for the given dataset and recreate any typical input sequence accurately. However, when there is an anomaly in the input sequence, the model's output will be vastly different from the input because the model doesn't know how to recreate it properly. This reconstruction loss can be used as a metric to uncover anomalies within the system. In \cite{kumarage2018anomaly} the authors used this method to detect anomalies in distributed systems. The main benefit of this approach is that the dataset doesn't have to be labeled and the model learns to differentiate the normal from the abnormal.
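A minimal sketch of this reconstruction-error idea is shown below using Keras. It is only meant to make the mechanism concrete: the layer sizes, the window shape, and the random data standing in for real telemetry are all assumptions, and it is not the convolutional model proposed in this project.

\begin{verbatim}
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

WINDOW, FEATURES = 60, 4  # e.g. CPU, memory, request rate, error rate

# Dense autoencoder with a narrow bottleneck (the latent space).
inputs  = keras.Input(shape=(WINDOW * FEATURES,))
hidden  = layers.Dense(64, activation="relu")(inputs)
latent  = layers.Dense(8, activation="relu")(hidden)    # bottleneck
hidden2 = layers.Dense(64, activation="relu")(latent)
outputs = layers.Dense(WINDOW * FEATURES)(hidden2)

autoencoder = keras.Model(inputs, outputs)
autoencoder.compile(optimizer="adam", loss="mse")

# Train only on windows that represent normal behaviour
# (random numbers stand in for real, scaled telemetry here).
x_normal = np.random.rand(1024, WINDOW * FEATURES).astype("float32")
autoencoder.fit(x_normal, x_normal, epochs=10, batch_size=32, verbose=0)

def anomaly_score(window: np.ndarray) -> float:
    """Reconstruction error of one window; high values suggest anomalies."""
    recon = autoencoder.predict(window[None, :], verbose=0)[0]
    return float(np.mean((window - recon) ** 2))

# Calibrate a threshold on held-out normal data, then flag anything above it.
threshold = np.percentile([anomaly_score(w) for w in x_normal[:200]], 99)
\end{verbatim}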
+ +\textbf{Generative adversarial networks} + +In a continuation of their work, \cite{kumarage2019generative} tried doing the same thing but in the opposite way, using a \ac{gan} \citep{goodfellow2014generative}. Here the generator network tries to learn the target data distribution (the non-anomalous dataset) while the discriminator is trying to classify the normal and the abnormal. In this setup, the generator network acts as the decoder while the discriminator network tries to encode the generator's output into a single scalar value, which is the anomaly score. The authors of the paper tried using both a traditional \ac{gan} and a Bidirectional Generative Adversarial Network (BiGAN) \citep{donahue2016adversarial}, but in the end they concluded that, even though the \ac{gan} showed a tendency towards better performance as the dataset gets bigger, with the dataset they had autoencoders performed well overall. + +\subsubsection{Convolutional neural networks} + +Ever since DeepMind came up with WaveNet, which used a CNN to generate audio samples \citep{oord2016wavenet}, researchers have been uncovering potential use cases beyond image-related tasks. One of those use cases, since CNNs excel at pattern recognition, is encoding time series datasets into image-like data structures and using a CNN to identify abnormal patterns in them. In \cite{kim2018encoding} the authors used a novel technique to encode raw data into a pixel-like structure and found it could outperform existing methods for detecting anomalies in computer networks. In another work, \cite{zhang2019deep} used a Convolutional Long Short-Term Memory (ConvLSTM) network with an attention mechanism, which captured temporal dependencies more accurately. + + +\subsection{Root Cause Identification} + +Predicting the exact root cause of a failure just using a standard machine learning model is a pretty difficult task since the prediction space is not finite. In 2017 a team from Google X tried using a Bayesian network to model the relationship between the state of the system and its effect on failures \citep{chigurupati2017root}. Using it they were able to accurately predict all the possible root causes of a hardware failure in certain systems, but this model required domain experts to predefine all the possible error modes, which isn't possible in a constantly evolving distributed system. There were similar attempts \cite{gonzalez2017root} to use machine learning to generate knowledge graphs from historical data and help developers reason about failures; although this eliminated the need for a domain expert, it also can't react to unseen errors. + +In a distributed system it's hard to spot real anomalies just by looking at monitoring data, but when there are huge spikes in response latencies or error rates it's a good indicator something must be wrong. So \cite{samir2019dla} used a \ac{hhmm} to uncover the possibly affected services from changes in response time or error rates in one service and used that data to uncover the root cause of the issue. All of the papers discussed above have one problem in common: they all assume the entire system is static, but in reality these services change over time, either with increased demand or new feature implementations. To address this, \cite{wu2020microrca} developed a service that monitors all the running applications and their vitals. It also constructs an attributed graph that represents how each service interacts with the others. When the monitoring system detects an anomaly, MicroRCA weights that graph with response time changes and tries to find the epicenter of the anomaly. The main problem both of these approaches have is that the authors rely solely on slow response times as the indication of an anomaly, but several other factors could cause anomalous behavior without changes in response times.
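To make the general idea behind these graph-based approaches concrete, the sketch below ranks root-cause candidates by running personalised PageRank over a tiny, hand-made service dependency graph whose nodes carry anomaly scores. It is a toy illustration of the concept, not a reimplementation of MicroRCA or any other cited system; the service names, the scores, and the choice of PageRank are all assumptions.

\begin{verbatim}
import networkx as nx

# Hypothetical call graph: edges point from caller to callee.
G = nx.DiGraph([("frontend", "cart"), ("frontend", "catalogue"),
                ("cart", "db"), ("catalogue", "db")])

# Per-service anomaly scores produced by some detection model (made up).
anomaly_scores = {"frontend": 0.7, "cart": 0.6, "catalogue": 0.1, "db": 0.9}

# Personalised PageRank: the random walker restarts at anomalous services
# and walks towards their dependencies, so a shared dependency that many
# anomalous callers rely on accumulates the most rank.
ranking = nx.pagerank(G, alpha=0.85, personalization=anomaly_scores)

for service, score in sorted(ranking.items(), key=lambda kv: -kv[1]):
    print(f"{service}: {score:.3f}")  # highest score = likeliest root cause
\end{verbatim}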
+ +\subsection{Comparison of Existing Systems} + +\begin{longtable}{| p{25mm} | p{62mm} | p{62mm} |} +\hline + \textbf{Research} & + \textbf{Improvements} & + \textbf{Citations} \\ \hline + + \multicolumn{3}{|c|}{\textbf{Instrumentation}} \\ \hline + + \cite{toka2021predicting} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Explained a way to build a cloud-native data aggregation and analytics pipeline using open-source software. + \item The proposed system is not platform dependant. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item The overhead on the monitoring system is a bit high. + \item Analytics pipelines rely solely on KPI correlation. + \vspace{-7mm} + \end{itemize} \\ \hline + + + + \multicolumn{3}{|c|}{\textbf{Anomaly detection}} \\ \hline + + \cite{prabodha2017monitoring} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Explained the most popular methods to detect anomalies. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item The authors didn't consider learning-based approaches. + \vspace{-7mm} + \end{itemize} \\ \hline + + \cite{kumarage2018anomaly} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Due to the difficulty of finding labeled research data, they settled on using a semi-supervised technique. + \item Used randomized decision trees were utilized to select the most suitable features that correspond to each component. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item The model won't be easily transformable for other systems. + \item If more new key features were added to the system it will require total retraining. + \vspace{-7mm} + \end{itemize} \\ \hline + + \cite{kim2018encoding} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Introduced a new encoding technique so CNN can identify anomalies better. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Even though this outperformed the “gray-scale encoding” \citep{dasgupta2002anomaly} technique, a comparison study with random forest showed this method gets outperformed by random forest. + \vspace{-7mm} + \end{itemize} \\ \hline + + \cite{du2018anomaly} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Introduced SLIs to monitor data. + \item A lot of good metrics (input data). + \item Performance monitoring of services and containers. + \item Tested multiple Machine learning models to see which one works best. + \item Introduce a Fault Injection System. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Only can identify predetermined issues. + \item Require a sidecar that includes a lot of overhead. + \item Won't work with event-driven architectures. + \item Uses Supervised learning and it's near impossible to find real-world data with labels. + \vspace{-7mm} + \end{itemize} \\ \hline + + \cite{kumarage2019generative} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Used a different approach to predict the future timesteps from the past events. + \item Which outperformed passed techniques when the monitored data points become larger. 
+ \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Accuracy is around 60\% which is not good to use in production with mission-critical systems. + \item As this is a GAN-based system, it may take a lot of resources to run with production systems. + \vspace{-7mm} + \end{itemize} \\ \hline + + \cite{zhang2019deep} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Introduce a new embedding technique. + \item A used hybrid method which uses convolutional autoencoder and LSTM network. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Feature maps aren't iterative to understand so operators have to blindly trust the network. + \item It will be hard to set the network to ignore expected anomalies. + \vspace{-7mm} + \end{itemize} \\ \hline + + + \multicolumn{3}{|c|}{\textbf{Root cause identification}} \\ \hline + + \cite{chigurupati2017root} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Experimented with different metrics till it narrowed down. + \item Find hidden meaning in values that seems random. + \item Used a probabilistic approach to better understand the relationship between inputs and outputs. + \item Gives all the possible root causes of a given problem. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Require support from domain experts. + \item Can't account for unforeseen error. + \vspace{-7mm} + \end{itemize} \\ \hline + + \cite{gonzalez2017root} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Build a predictable system. + \item Automatic identification of dependencies between system events. + \item Doesn't Need to rely on Domain experts. + \item Generalized to different systems. + \item Used data windowing. + \item Randomly permuting to test how the model reacts to random inputs. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item The knowledge graph is good for visualization of the problem but still, people have to manually figure out what went wrong. + \item It assumes that the collaboration and knowledge of network operators and managers are available. + \item Random Forests doesn't scale well for a big input set. + \vspace{-7mm} + \end{itemize} \\ \hline + + \cite{samir2019dla} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Custom HHMM model. + \item Self-healing mechanism. + \item Focus on performance detection and identification at the container, node, and microservice level. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item The input dataset scale is limited. + \item Require a sidecar. + \item Needs predetermined thresholds. + \vspace{-7mm} + \end{itemize} \\ \hline + + \cite{wu2020microrca} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Created a custom Faults Injection module. + \item Uses an attribute graph to localize to faulty service. + \item Application-agnostic by using a service mesh. + \item Rely on service mesh to determine network topology. + \item Used Unsupervised learning. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=3mm,noitemsep,nolistsep] + \item Only able to identify 3 types of issues. + \item Looks only for performance anomalies. 
+ \item Use the slow response time of a microservice as the definition of an anomaly. + \item Service meshes add a lot of overhead to systems. + \item Paper doesn't talk about services that ain't directly connected. + \item Won't work with event-driven architectures. + \vspace{-7mm} + \end{itemize} \\ \hline + + \caption{Review of existing systems (self-composed)} +\end{longtable} + + +\subsection{Benchmark} +Since both \ac{aiops} and automated root cause analysis emerging fields there isn't any standard way to benchmark the system against the existing ones. So the author is hoping to perform a baseline benchmark with all the existing work based on a few key metrics like f1-score and memory footprint of the system. \ No newline at end of file diff --git a/documentation/PSPD/chapters/literature-review/main.tex b/documentation/PSPD/chapters/literature-review/main.tex new file mode 100644 index 0000000..804127e --- /dev/null +++ b/documentation/PSPD/chapters/literature-review/main.tex @@ -0,0 +1,7 @@ +\chapter{Literature Review} +\input{chapters/literature-review/chapter-overview} +\input{chapters/literature-review/concept-map} +\input{chapters/literature-review/problem-domain} +\input{chapters/literature-review/approach} +\input{chapters/literature-review/existing-work} +\input{chapters/literature-review/chapter-summary} \ No newline at end of file diff --git a/documentation/PSPD/chapters/literature-review/problem-domain.tex b/documentation/PSPD/chapters/literature-review/problem-domain.tex new file mode 100644 index 0000000..ab75258 --- /dev/null +++ b/documentation/PSPD/chapters/literature-review/problem-domain.tex @@ -0,0 +1,71 @@ +\section{Domain Overview} + +\subsection{Introduction to Distributed Systems} + +Distributed Systems are a type of system which are designed to operate in a fragmented setting. This fragmented style helps the system to distribute its workload over many computers across a network which in itself makes scaling such a system easy as adding more computers to the network. This method of scaling is called horizontal scaling. Using this kind of fragmented architecture helps to increase the reliability of the system since the likelihood of a single hardware failure knocking out the entire system gets smaller and smaller as the network grows. + +In the early days, only large-scale enterprises could afford the cost of building distributed systems but in recent years with the rise of cloud computing \citep{CloudAdo16:online} creating our own distributed systems could be done with just a press of a few buttons. + +\subsubsection{Microservices} \label{sec:intro-microservices} + +This radical shift introduced a new paradigm of computing called Microservices. Where a bunch of small and self contain services work to-gather for a big and complex system. These services can be individually deployed and scaled. Due to this nature users can deploy replicas of a single service across many \acp{vm} and put a load balancer that will split the traffic between them. This method will allow the service to maintain availability even if multiple \acp{vm} which contain a copy of the service goes down \citep{chaczko2011availability}. + +\subsubsection{Containerization} + +Even though microservices helped more organizations meet a higher level of availability due to their decoupled nature. It was very difficult to manage the lots and lots of tiny services spread across hundreds of \acp{vm}. 
Since these services were isolated using a virtualization layer, the cost and overhead of maintaining such a system were very high \citep{dua2014virtualization}. To mitigate this problem, a new method of packaging an application called "Containerization", inspired by the logistics industry, was invented. The rationale behind this technique is to package all the dependencies of the program into a single image without the operating system itself, and at runtime to share a single, logically separated operating system across all the containers. Using this technique removes a lot of overhead from the system, and the aforementioned dependency packaging also makes it simple to move an application running on a local computer to a remote server. + +\begin{figure}[H] + \includegraphics[width=10cm]{assets/literature-review/containers-vs-virtual-machines.jpg} + \caption{Difference between hosting 3 apps in virtual machines vs Containers \citep{Dockervs91:online}} +\end{figure} + +\subsubsection{Container Orchestration} + +Even though containers solve a major headache when it comes to operating a distributed system, managing hundreds of containers becomes a major challenge. In a large-scale distributed system it is simply not possible to use one \ac{vm} to house all the containers. The containers need to be spread across dozens of \acp{vm}, and in some cases across different \ac{vm} vendors. Networking, replication, security, and \textbf{monitoring} then need to be accounted for. To solve all of these problems, a number of different container orchestration systems were introduced \citep{ElasticityCloudComputing}. As per a survey done by Red Hat in July 2021, 88\% of respondents prefer to use Kubernetes as their container orchestrator, while 74\% mentioned they use Kubernetes for their production workloads \citep{Kubernet59:online}. + +\begin{figure}[H] + \includegraphics[width=10cm]{assets/literature-review/Container-orchestration-engines.png} + \caption{Overview of a container orchestration engine \citep{ElasticityCloudComputing}} +\end{figure} + + +\subsection{Reliability Engineering} + +With the rise of cloud computing, a new culture of software development called DevOps emerged. According to \cite{kim2014phoenix}, the philosophy behind DevOps was adopted from the "Toyota Way" \citep{liker2006toyota}. In that book the author talks about "The Three Ways": +\begin{enumerate} +\item \textbf{Principles of Flow} - Work flows from left to right (from requirement to production), and to maximize flow, batch sizes need to be lowered. +\item \textbf{Principles of Feedback} - To increase quality, feedback must be passed from right to left so that the entire idea-to-production workflow has a feedback loop. +\item \textbf{Principles of Continuous Learning} - Every failure is a learning opportunity. +\end{enumerate} +These three principles essentially make up the modern DevOps culture, and a well-implemented DevOps culture in an organization will yield much better results in both the quality of the system and its reliability. + +As mentioned above, DevOps is considered a culture, or a set of abstract principles, that breaks down organizational silos to achieve a level of agility that was considered impossible some time ago. \ac{sre} is the implementation of this abstract concept with clearly defined roles and tasks. Their main responsibility is to keep the system running as smoothly as possible and to adapt the infrastructure to fit the needs of the system.
To achieve this, \acp{sre} rely on a number of automation tools, which help them with everything from building the software to extracting real-time insights while it is running in production. + + +\subsection{How to Identify the Root Cause of a Failure}\label{sec:how-root-course} + + +To identify the root cause of an issue in the system, three major steps need to be completed. +\begin{enumerate} +\item Detect that there is an issue with the system. +\item Find all the affected services. +\item Estimate the most probable cause. +\end{enumerate} + +Typically, most distributed systems have some sort of monitoring system which collects telemetry data about the system in real-time. This allows the \acp{sre} to get a bird's-eye view of the system's status. But to achieve a higher level of reliability it is crucial to keep tabs on every sub-component of the system, so that its behavior can be understood at any given time. + +Even though this is the ideal scenario, this approach doesn't scale well. At some point it becomes humanly impossible for the \ac{sre} team to keep track of all these services. To solve this, the concept of the \ac{sli} was introduced \citep{beyer2016site}. The idea is to provide a quantitative measure of a very specific part of the system as it is experienced by the end-users. For example, request latency is one of the most commonly used \acp{sli}. This helps to lower the number of metrics \acp{sre} have to monitor, but small errors that affect a minority of users could still go unnoticed for months. + +To detect these kinds of issues, all the microservices in the system need to emit a lot of telemetry data, and those data need to be individually processed in near real-time to catch errors early. The most widely adopted method of extracting meaningful information from such a data stream is setting up threshold-based alerts which send notifications when there is a threshold violation. The main drawback of this approach is that \acp{sre} have to predict both the metrics that need to be monitored and the "normal rates" for those metrics by looking at past data, and assume those values remain valid in the future; if the service is newly deployed it is really hard to get either of those right. In fact, on the 14th of December 2020, all of Google's services went unresponsive because Google's OAuth service ran out of disk space, and no one at Google noticed until user reports started flooding in \citep{Googleoutage:online}. + +Typically in distributed systems, services have a lot of interdependent connections that form the bigger system. When a dependent service is experiencing an issue, for example an elevated level of request latency, it is possible for a service consuming it to also show an elevated level of request latency. So when there is an outage or issue with the system, \acp{sre} look at all the services with abnormal metric readings and make an educated guess about the most probable root cause. This gets repeated until the real root cause is found. The hardest part of this process is making these educated guesses, and it requires the involvement of a system expert with a lot of experience with the target system.
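+As a concrete, deliberately naive illustration of the threshold-based alerting described above, the following Python sketch flags latency samples that cross a fixed limit. The samples and the 250 ms threshold are invented for the example; the threshold is exactly the kind of hand-picked "normal rate" that is hard to choose for a newly deployed service.
+\begin{verbatim}
+# Naive static-threshold alerting over a latency SLI stream (illustrative only).
+LATENCY_THRESHOLD_MS = 250        # hand-picked "normal rate" -- the weak point
+latency_samples_ms = [120, 135, 128, 900, 140, 131, 620, 125]
+
+def check_thresholds(samples, threshold):
+    """Return the indices of samples that violate the fixed threshold."""
+    return [i for i, value in enumerate(samples) if value > threshold]
+
+for i in check_thresholds(latency_samples_ms, LATENCY_THRESHOLD_MS):
+    # In a real monitoring stack this would page an on-call engineer instead.
+    print(f"ALERT: sample {i} latency {latency_samples_ms[i]} ms "
+          f"exceeds {LATENCY_THRESHOLD_MS} ms")
+\end{verbatim}
+A slow drift in behaviour, or a fault that never crosses the hand-picked limit, would pass such a check silently, which is the gap the learning-based approach explored in this project aims to close.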
+ +\begin{figure}[H] + \includegraphics[width=16cm]{assets/literature-review/demo.png} + \caption{Root cause localization pipeline (self-composed)} +\end{figure} + +\subsection{Artificial Intelligence for IT operations} + +When it comes to IT operations such as managing infrastructure and identifying the root cause of a failure, as mentioned in section \ref{sec:how-root-course}, there tend to be a lot of both structured and unstructured data sources. \ac{aiops} is an emerging field \citep{Artifici8:online} where data scientists and reliability engineers cooperate to build data-driven, smarter systems with the help of machine learning, achieving a quality of service that simply isn't possible with traditional methods due to the density and complexity of the datasets. + diff --git a/documentation/PSPD/chapters/methodology/development-methodology.tex b/documentation/PSPD/chapters/methodology/development-methodology.tex new file mode 100644 index 0000000..bed416c --- /dev/null +++ b/documentation/PSPD/chapters/methodology/development-methodology.tex @@ -0,0 +1,15 @@ +\section{Development Methodology} + +Even though this project has a few clearly defined requirements, designing and developing them will require an iterative model, as there isn't a single best way to build this system and the author will be experimenting with different techniques. Thus the author decided on using \textbf{prototyping} as the \ac{sdlc} model for this project.\\ + +\subsection{Design Methodology} + +To design the system diagrams for this project, \ac{ooad} methods will be used. \ac{ooad} makes it easier to design the system iteratively, which complements the chosen \ac{sdlc} model, prototyping. + +\subsection{Evaluation Methodology} + +During the literature survey, the author concluded that there are no specific evaluation metrics for root cause analysis systems other than accuracy and F1 score, and that there are no publicly available datasets or systems to benchmark against. Base-level benchmarks will therefore be carried out to compare the proposed system with the existing ones; a short illustrative sketch of these metrics is given at the end of this section. + +\subsection{Requirements Elicitation} + +As the results of this project will mostly be used by \acp{sres} and system administrators, the author is hoping to talk with a few experts in the respective fields to get a better idea of what is expected from a system like this. Moreover, as mentioned in \ref{sec:out-scope}, this system is not designed to entirely replace existing monitoring systems, so the author is hoping to research production monitoring systems and their workflows to understand how the proposed system could seamlessly integrate with them.
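+To make the evaluation metrics mentioned in the evaluation methodology above concrete, the following is a minimal Python sketch of the accuracy and F1-score calculations that the base-level benchmarks would report. The label vectors are invented examples, not results.
+\begin{verbatim}
+# Accuracy and F1 score for binary anomaly labels (1 = anomaly), computed
+# by hand for illustration; the example labels below are made up.
+def accuracy_and_f1(y_true, y_pred):
+    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
+    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
+    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
+    tn = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 0)
+    accuracy = (tp + tn) / len(y_true)
+    precision = tp / (tp + fp) if (tp + fp) else 0.0
+    recall = tp / (tp + fn) if (tp + fn) else 0.0
+    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
+    return accuracy, f1
+
+y_true = [0, 0, 1, 1, 0, 1, 0, 0]   # hypothetical ground-truth labels
+y_pred = [0, 1, 1, 0, 0, 1, 0, 0]   # hypothetical model predictions
+print(accuracy_and_f1(y_true, y_pred))   # -> (0.75, 0.666...)
+\end{verbatim}
+Memory footprint, the other benchmark dimension mentioned in the literature review, would be measured separately by observing the resource usage of each deployed component.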
diff --git a/documentation/PSPD/chapters/methodology/introduction.tex b/documentation/PSPD/chapters/methodology/introduction.tex new file mode 100644 index 0000000..a200b51 --- /dev/null +++ b/documentation/PSPD/chapters/methodology/introduction.tex @@ -0,0 +1 @@ +\section{Chapter Overview} \ No newline at end of file diff --git a/documentation/PSPD/chapters/methodology/main.tex b/documentation/PSPD/chapters/methodology/main.tex new file mode 100644 index 0000000..aec4741 --- /dev/null +++ b/documentation/PSPD/chapters/methodology/main.tex @@ -0,0 +1,6 @@ +\chapter{Methodology} +\input{chapters/methodology/introduction} +\input{chapters/methodology/research-methodology} +\input{chapters/methodology/development-methodology} +\input{chapters/methodology/project-management-methodology} +\input{chapters/methodology/summary} \ No newline at end of file diff --git a/documentation/PSPD/chapters/methodology/project-management-methodology.tex b/documentation/PSPD/chapters/methodology/project-management-methodology.tex new file mode 100644 index 0000000..7adaba3 --- /dev/null +++ b/documentation/PSPD/chapters/methodology/project-management-methodology.tex @@ -0,0 +1,136 @@ +\section{Project Management Methodology} + +To manage the tasks of this project, the author decided to use \textbf{Agile PRINCE2}. Agile PRINCE2 is built upon the waterfall method, which works best for projects with fixed deadlines and requirements, with the added benefit of having regulated inputs and outputs \citep{WhatAreT79:online}. + +\subsection{Deliverables} +\setlength\LTleft{0mm} +\begin{longtable}{|p{115mm}|p{35mm}|} +\hline +\textbf{Deliverable} & \textbf{Date} \\ \hline +\textbf{Draft Project Proposal} & \multirow{2}{*}{02nd September 2021} \\ +A draft version of this proposal & \\ \hline +\textbf{A working beta of MicroSim}\label{microsim} & \multirow{2}{*}{15th September 2021} \\ +MicroSim is a tool that simulates a distributed system within a Kubernetes cluster. This tool will be used to test and evaluate the final version of this project & \\ \hline +\textbf{Research Paper about MicroSim} & \multirow{2}{*}{16th October 2021} \\ +MicroSim could have various other use-cases and could help in the development of this research domain. So the author is planning to release it as an open-source project with a paper so that future research can benefit from it. & \\ \hline +\textbf{Literature Review Document} & \multirow{2}{*}{21st October 2021} \\ +The document explaining all the existing tools and published research in the domain & \\ \hline +\textbf{Project Proposal} & \multirow{2}{*}{04th November 2021} \\ +The final version of this project proposal.
& \\ \hline +\textbf{Software Requirement Specification} & \multirow{2}{*}{25th November 2021} \\ +The Document all the key requirements that are gonna get address with this research & \\ \hline +\textbf{Proof of Concept} & \multirow{2}{*}{06th December 2021} \\ +Unoptimized prototype with all the main features working & \\ \hline +\textbf{Interim Progress Report (IPR)} & \multirow{2}{*}{27th January 2022} \\ +The document explaining all the preliminary findings and the current state of the project & \\ \hline +\textbf{Test and Evaluation Report} & \multirow{2}{*}{17th March 2022} \\ +A document with results of the project and conclusion made from those tests & \\ \hline +\textbf{Draft Project Reports} & \multirow{2}{*}{31st March 2022} \\ +The draft version of the final thesis & \\ \hline +\textbf{Final Research Paper} & \multirow{2}{*}{14th April 2022} \\ +A paper with results about this project & \\ \hline +\textbf{Final Project Report} & \multirow{2}{*}{28th April 2022} \\ +Finalize version of the thesis & \\ \hline +\caption{Deliverables and due dates (self-composed)} +\end{longtable} + +\subsection{Schedule} +% Gantt chart is a visualization of the task with their respective timelines. Refer Appendix \ref{appendix:gantt-chart} to find the gantt chart for this project. +\begin{figure}[H] + % \chapter{Gantt Chart} \label{appendix:gantt-chart} + % \centering + % \includegraphics[width=15cm]{assets/gantt-chart.jpg} + \includegraphics[height=22cm]{assets/methodology/gantt-chart.jpg} + \caption{Defined gantt chart for the project (self composed)} +\end{figure} + + +\subsection{Resource Requirement} + +\subsubsection{Software Requirements} + +\begin{itemize}[noitemsep,nolistsep] +\item \textbf{Ubuntu / Arch Linux} - Since this project will use \ac{ebpf} as a dependency it will require a Linux kernel based operating system. +\item \textbf{Python / R} - This project has a data science. So using a language with good data science eco-system will make this process easier. +\item \textbf{GoLang / Rust} - While GoLang has official client library made by Kubernetes developers themselves, kube-community has developed an excellent alternative in Rust. +\item \textbf{K3d / Minikube} - To create a Kubernetes cluster locally for development and testing. +\item \textbf{Jetbrain IDEs / VS Code} - IDE provides lot of tools that will help developing complex project like this easily. +\item \textbf{Google Docs / Overleaf} - To create documation about the project the author can use a usual editor like Google Docs or declaratively tool like Overleaf which use coding like style to format the document. +\item \textbf{Google Drive / Github} - Offsite location to backup the codebase and related documents. +\item \textbf{ClickUp / Notion} - To manage the project and keep track of things to be done. +\end{itemize} + +\subsubsection{Hardware Requirements} +\begin{itemize}[noitemsep,nolistsep] + \item \textbf{Quad-core CPU with AVX support} - AVX is a CPU instruction set which is optimze for vector operations. Having an AVX supported CPU could reduce the model inference time. + \item \textbf{GPU with CUDA support and 2GB or more VRAM} - Both Tensorflow and Pytorch depend on CUDA for hardware-accelerated training. Training on GPU could save a lot of time increases the number of trial and error iterations that could be done. + \item \textbf{16 GB or more Memory} - Running a microservices simulation locally will consume a lot of memory and while testing models will get loaded into RAM. 
+ \item \textbf{At least 40GB disk space} - To store the dataset, models docker containers while developing the project. +\end{itemize} + +\subsubsection{Skill Requirements} +\begin{itemize}[noitemsep,nolistsep] + \item \textbf{Experience working with Kubernetes} - The author will be developing a Kubernetes extension so they need to know the inner workings of Kubernetes. + \item \textbf{Data engineering} - Developing a data encoding technique requires a lot of knowledge in how to manipulate a given dataset. + \item \textbf{Model engineering} - Creating model from ground up is difficult task. So the author needs to have an in-depth idea about a machine learning framework and how different layers in the model work in order to fit them properly. +\end{itemize} + +\subsubsection{Data Requirements} +\begin{itemize}[noitemsep,nolistsep] +\item \textbf{Monitoring dataset} - This dataset can be collected using \hyperref[microsim]{MicroSim} tool author plan to develop to simulate distributed system. +\end{itemize} + +\subsection{Risk Management} + + +\begin{longtable}{|p{4.8cm}|p{1.35cm}|p{1.8cm}|p{7cm}|} + \hline + \textbf{Risk Item} & + \textbf{Severity} & + \textbf{Frequency} & + \textbf{Mitigation Plan} + \\ \hline + + The hypothesis the research is based on is wrong & + 5 & + 1 & + Present the findings and explain why the hypothesis was wrong + \\ \hline + + Failure in work computer & + 4 & + 3 & + Daily backup work the work to a cloud platform + \\ \hline + + Lack of domain knowledge & + 2 & + 3 & + Talk to a domain expert, Do more research + \\ \hline + + Models not generalizing & + 3 & + 4 & + Explore different methods, Try cleaning up the dataset more + \\ \hline + + Dataset quality is not up to the standard & + 4 & + 1 & + Use a method used in related researches to create a new dataset + \\ \hline + + Running out of time & + 1 & + 2 & + Following a thorough work schedule + \\ \hline + + Getting sick and unable to work for few days & + 3 & + 3 & + Keeping few days of a buffer period before deadlines + \\ \hline + \caption{Risks and mitigations (self-composed)} +\end{longtable} \ No newline at end of file diff --git a/documentation/PSPD/chapters/methodology/research-methodology.tex b/documentation/PSPD/chapters/methodology/research-methodology.tex new file mode 100644 index 0000000..6b4231c --- /dev/null +++ b/documentation/PSPD/chapters/methodology/research-methodology.tex @@ -0,0 +1,26 @@ +\section{Research Methodology} + + +\begin{longtable}{|p{35mm}|p{125mm}|} +\hline + \textbf{Research Philosophy} & + Mainly, there are four research philosophies, Pragmatism, positivism, realism, and interpretivism. It explains the belief and the research is done. After doing an in-depth study about research philosophies, the author decided on following \textbf{Pragmatism} as the research philosophy because the author believes there is no one way to solve the problem this research is tried to address and the goal of this research is to solve a practical problem faced by \acp{sres}. (\cite{1Philoso75:online}, \cite{Pragmati87:online}) + \\ \hline + + \textbf{Research Approach} & + Although the inspiration for the research came from an observation of the real world. The author is using \textbf{deductive reasoning} to approach the problem. After the problem was identified the author looked for existing work found few theories on the domain. Then the author found few flaws in these methods thought of a way to address them with different approaches. 
At the end of the research, the author hopes to implement these new approaches and observe their outcomes. + \\ \hline + + \textbf{Research Strategy} & + The research strategy describes how the research questions will be answered. In this project, the author will use \textbf{experiments, interviews, and surveys} to answer the research questions. + \\ \hline + + \textbf{Research Choice} & + During this research project, the author is planning to build a very generalized solution to predict anomalies. To achieve this, a \textbf{quantitative} dataset will be used to train the model while a \textbf{qualitative} dataset will be used to evaluate it. So the data for this research will be collected using the \textbf{mixed method}. + \\ \hline + + \textbf{Time Horizon} & + This project needs to be completed within 8 months, so a \textbf{cross-sectional} time horizon will be used to collect data to complete the project. + \\ \hline + \caption{Research methodology selection (self-composed)} +\end{longtable} diff --git a/documentation/PSPD/chapters/methodology/summary.tex b/documentation/PSPD/chapters/methodology/summary.tex new file mode 100644 index 0000000..50d0381 --- /dev/null +++ b/documentation/PSPD/chapters/methodology/summary.tex @@ -0,0 +1 @@ +\section{Chapter Summary} \ No newline at end of file diff --git a/documentation/PSPD/chapters/requirement-specification/chapter-overview.tex b/documentation/PSPD/chapters/requirement-specification/chapter-overview.tex new file mode 100644 index 0000000..67914c8 --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/chapter-overview.tex @@ -0,0 +1,3 @@ +\section{Chapter Overview} + +In this chapter, the author will discuss the proposed system's requirements and how they were discovered. To start, a rich picture diagram will be presented to give a high-level overview of the system's interaction with its stakeholders. After that, there will be an in-depth analysis of all the positive and negative stakeholders, explaining how each of them views the system. The requirements for the project were discovered using multiple techniques, and both the requirement discovery techniques and the discovered requirements will be explained within this chapter. Finally, the chapter will be concluded with a context diagram that explains the system boundaries and use case diagrams which visualize the relationships between the functions and the users of the system. \ No newline at end of file diff --git a/documentation/PSPD/chapters/requirement-specification/context-digram.tex b/documentation/PSPD/chapters/requirement-specification/context-digram.tex new file mode 100644 index 0000000..b8da2f9 --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/context-digram.tex @@ -0,0 +1,10 @@ +\section{Context Diagram} + +Since the proposed system falls into the category of an open system, it communicates with a lot of external parties. It is advised to have a clear system boundary when developing such a system so that the development process doesn't get overwhelming. Figure \ref{fig:context-digram} explains the main interactions between the system and third parties.
+ +\begin{figure}[H] + % \setlength{\fboxsep}{10pt} + \includegraphics[width=13cm]{assets/requirement-specification/contex-digram.png} + \caption{Context diagram (self-composed)} + \label{fig:context-digram} +\end{figure} diff --git a/documentation/PSPD/chapters/requirement-specification/data-analysis.tex b/documentation/PSPD/chapters/requirement-specification/data-analysis.tex new file mode 100644 index 0000000..738d262 --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/data-analysis.tex @@ -0,0 +1,138 @@ +\section{Analysis of Gathered Data} + +\subsection{Literature Review} + +To find the root cause of a fault in a distributed system, three components are required: instrumentation, anomaly detection, and root cause localization. A literature review was conducted on each of these components to derive the requirements needed to build an automated root cause analysis platform. + +% To find the root course of fault in a distributed system, three components are required, instrumentation, anomaly detection, and root course localization. Those three must work together to give a final prediction. From these three, anomaly detection has a lot of possible pathways to approach the problem. This is mainly due to anomaly detection being a broad topic that isn't limited to cloud computing. For the sake of this project using a semi-supervised technique with a convolutional autoencoder network seemed like the idle approach since those networks are lightweight and more reliable when it comes to generalizing. For instrumentation use of a Linux kernel feature called \ac{ebpf} seemed the current treading method due to it being very lightweight and not requiring any code changes to the existing system. Finally for the root course localization almost all the published work used graphed based method one way or another due to the nature of the problem being always dynamic. So the author decide to rely on a weighted graph to find a possible root course after an anomaly is detected. + +\begin{longtable}{|p{105mm}|p{50mm}|} + \hline + \textbf{Finding} & + \textbf{Citation} \\ \hline + + \ac{ebpf} is a modern, low overhead technique to extract telemetry by tracing Linux kernel calls. & + \cite{LKMLIngo52:online} \\ \hline + + To find the root cause of a fault in a distributed system, three components are required: instrumentation, anomaly detection, and root cause localization. & + \cite{wu2020microrca} \\ \hline + + Unsupervised learning algorithms excel at identifying unknown patterns. & + \cite{silver2017mastering}, \cite{kumarage2018anomaly}, \cite{khoshnevisan2019rsm} \\ \hline + Convolutional autoencoder networks give the best performance-to-resources ratio when it comes to detecting anomalies. & + \cite{zhang2019deep}, \cite{khoshnevisan2019rsm} \\ \hline + + Root causes could be identified by weighting anomaly scores in directed graphs. & + \cite{samir2019dla}, \cite{wu2020microrca}, \cite{ma2020automap}, \cite{meng2020localizing} \\ \hline + There isn't a common benchmarking or testing platform for root cause finding algorithms. & + \cite{wu2020microrca}, \cite{soldani2021anomaly} \\ \hline + + Kubernetes is the go-to method to manage distributed systems. & + \cite{CloudNat36:online} \\ \hline + + \caption{Requirements derived from literature review (self-composed)} + +\end{longtable} + +\subsection{Interviews} + +A set of qualitative interviews was conducted to gather external feedback and opinions about the project and the proposed system.
For this, a senior production engineer, a Senior software engineer, a software engineer, and a trainee DevOps engineer were interviewed. The following table breaks down themes that were emerged after analyzing the transcript and findings based on that. +\newpage +\begin{longtable}{|p{30mm}|p{61mm}|p{60mm}|} + \hline + \textbf{Theme} & + \textbf{Review} & + \textbf{Evidence} \\ \hline + + Finding the root cause & + Almost all the interviewees agreed that whenever there is an outage, it’s a cumbersome task to find the root course of that outage. Having a system that makes this experience even slightly better would have huge implications. & + "I mean so often, sometimes it's identifying what the problem even is.” - Matthias Rampke (00:31:39) + \newline + \newline + “I mean, figuring out the problem is always hard because you're under stress.” - Jacob Payne (00:23:27) + \\ \hline + + + + ML for RCA & + This is a bit of a gray area. One of the experts I interviewed said he had a long history of working with time series forecasting and results are generally mediocre due to the sheer number of external factors. But since this is a closed system results may vary. & + “Ideally it helps me understand why the AI thinks this service is the problem.” \newline- Matthias Rampke (00:35:51) + \newline + \newline + “It could also be useful if it could trace outage implications.” \newline- Jonathan Reiter \\ \hline + + + Kubernetes & + Almost all the technical experts I have talked to agreed-upon Kubernetes is the way to manage distributed systems in the modern era and building this system tailor-made to Kubernetes was a wise choice. & + “An interesting shift that's been happening in this and it's sort of driven by Kubernetes, is from tools that imperatively do things to tools that pull a desired state and make it so.” \newline- Matthias Rampke (00:25:11) + \newline + “I mean, in the last five years, obviously Kubernetes blew up. That's been huge for the DevOps community.” \newline- Jacob Payne (00:16:38) \\ \hline + + + Service Meshes & + Experts have a “love-hate” relationship with Service Meshes. Service Meshes offers a lot of utility functions and useful data for debugging. But at the same time, they require a lot of resources to configure and maintain. & + “Going through a service mesh is a big investment, so I think there is a space for a smaller thing.” \newline- Matthias Rampke (00:05:04) \\ \hline + + Proposed Architecture & + During interviews, the author showed the Alpha version of the product along with the high-level system diagram, and all the interviewees were impressed by it and excited to get their hand on the final product. & + “I'm really looking forward to seeing it in action.” \newline- Matthias Rampke (00:11:10) + \newline + \newline + % “That's me, I think that's a lot of work, even for a capstone project to be on.” \newline- Jonathan Reiter (00:13:56) + % \newline + % \newline + “I can tell you the project as it is right now has value and if you can get an MVP out there that can be installed as a side car, that's going to be huge.” \newline- Jacob Payne (00:36:09) \\ \hline + +\caption{Inductive thematic analysis of interviews (self-composed)} +\end{longtable} + +\subsection{Self-evaluation} + + + +\begin{longtable}{|p{50mm}|p{105mm}|} + \hline + \textbf{Criteria} & + \textbf{Finding} \\ \hline + + Ways to integrate with Kubernetes & + There are two main ways to implement a system that interfaces with Kubernetes. 
Of these, creating standalone services that talk to the Kubernetes API is the most straightforward method. But during Google Summer of Code 2021, the author worked on a project which used the Kubernetes operator framework, a framework designed to extend the functionality of Kubernetes. After evaluating both options, it was decided to rely on the Kubernetes operator framework to build the controller for this project, since it is a proven and reliable method to build services that interface with Kubernetes \citep{Introduc93:online}. \\ \hline + + Telemetry extraction method & + Service meshes are currently the most common way of extracting telemetry, but there is a huge focus on \ac{ebpf}-related products due to their efficiency. \\ \hline + + New trends in DevOps & + From CI/CD to GitOps, DevOps engineers are trying to automate everything that can be automated to minimize human error \citep{CloudNat36:online}. \\ \hline + + Case studies on recent high profile outages & + A few months ago Microsoft experienced a global outage that took more than 24 hours to fully recover from. This was due to a single bad update to one of their authentication systems \citep{Microsof81:online}. \\ \hline + + \caption{Requirements derived from self-evaluation (self-composed)} +\end{longtable} + +% During the Self-evaluation of the existing system, it was made clear using \ac{ebpf} as the instrumentation technique has many benefits which range from very low overhead to fast and easier deployment but It comes with a few cons as well. The most notable example of this is the reliability of the collected data. Since \ac{ebpf} tracing works in lower of the TCP stack, there is a higher chance for corrupted data when compared to application layer level tracing which has all the bad packets filtered out. Still, the data points needed for this system like request rate can be calculated with good accuracy. + +% Another reoccurring factor that was identified during this process was the use of Kubernetes. According to a recent survey done by Cloud Native Computing Foundation, 91\% of respondents stated they rely on Kubernetes to manage their container infrastructure \citep{CloudNat36:online}. So building the proposed system tailor-made for Kubernetes would reap the maximum benefit. + +\subsection{Brainstorming} + +\begin{longtable}{|p{50mm}|p{105mm}|} + \hline + \textbf{Criteria} & + \textbf{Finding} \\ \hline + + Best way to detect anomalies & + The learning objective of an autoencoder is: given input X, output X. Even though this doesn't seem useful on its own, the process forces the network to deeply understand the underlying function of the given data distribution. After training under the right conditions, the model can output a value very close to its input, resulting in a low reconstruction loss. But if the input data is something the model hasn't seen during the training process (a novel anomaly), the model will produce a higher reconstruction loss. This can be used as a signal to identify the health of a service. \\ \hline + + \caption{Requirements derived from brainstorming (self-composed)} +\end{longtable} + +\subsection{Prototyping} + +At the start of this project, a simple Proof of Concept (POC) was developed to understand the feasibility of the project. The experiment started by taking a sine function and combining it with a noise function to create a varying data sequence. This was done to emulate service metrics in a small and easy-to-understand way.
After that, the author created a simple encoder-decoder network which, given an input sequence from 0 to n, was tasked with predicting n to n+10 of the sequence. Figure \ref{fig:poc-autoencoder} shows the results of the experiment. The blue line shows the input given to the network, while the green line shows the ground truth. Finally, the orange line is the model's prediction of how the metric should behave at any given time step. Notice the sudden dip around t=80: it is an artificially injected anomaly that isn't present in the training dataset, and because of that there is a clear difference between the expected and the actual readings of the metric. The author's idea was to use this difference to find anomalies in real-time metrics. Even though this worked well for a small-scale prototype, the author found that it doesn't translate to the highly noisy sequence patterns found on production servers, but it could be used as the entry point to a more robust solution. Refer to Appendix \ref{appendix:poc-results} to read more about this experiment. + +\begin{figure}[H] + \includegraphics[width=14cm]{assets/requirement-specification/poc-autoencoder.png} + \caption{The result from the proof of concept (self-composed)} + \label{fig:poc-autoencoder} +\end{figure} diff --git a/documentation/PSPD/chapters/requirement-specification/data-summary.tex b/documentation/PSPD/chapters/requirement-specification/data-summary.tex new file mode 100644 index 0000000..650179f --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/data-summary.tex @@ -0,0 +1,74 @@ +\section{Summary of Findings} + +\begin{longtable}{|p{105mm}|p{6mm}|p{6mm}|p{6mm}|p{6mm}|p{6mm}|} +\hline + \textbf{Findings} & + \rotatebox{90}{\textbf{Literature Review }} & + \rotatebox{90}{\textbf{Interviews}} & + \rotatebox{90}{\textbf{Self-evaluation}} & + \rotatebox{90}{\textbf{Brainstorming}} & + \rotatebox{90}{\textbf{Prototyping}} \\ \hline + + Finding the root cause of a problem during an outage is a time-consuming task. & + & + X & + X & + & \\ \hline + + It is possible to use machine learning to find anomalies in time series data. & + X & + & + & + & X \\ \hline + + A lot of companies are looking to migrate from monolithic architectures to microservices. & + X & + X & + & + & \\ \hline + + There is a big trend towards “automating boring tasks”. & + & + X & + X & + & \\ \hline + + Kubernetes is the most popular way to manage distributed systems. & + X & + X & + X & + & \\ \hline + + \ac{ebpf} provides a low overhead method to collect telemetry data without additional instrumentation. & + X & + & + X & + & X \\ \hline + + Autoencoders are good at finding and forecasting patterns in data. & + X & + & + & X + & X \\ \hline + + There aren't any established methods to test monitoring systems; creating a testing toolkit for that would help future researchers. & + X & + X & + & + & \\ \hline + + Having a dashboard which could show the blast radius of a system failure would help to reduce the \ac{mttr}. & + & X + & + X & + & \\ \hline + + The Kubernetes operator framework is the best way to build Kubernetes native applications.
& + & + X & + & + X & + X \\ \hline + + \caption{Summary of findings (self-composed)} +\end{longtable} \ No newline at end of file diff --git a/documentation/PSPD/chapters/requirement-specification/main.tex b/documentation/PSPD/chapters/requirement-specification/main.tex new file mode 100644 index 0000000..43098f5 --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/main.tex @@ -0,0 +1,21 @@ +\chapter{System Requirements Specification} + +\input{chapters/requirement-specification/chapter-overview} + +\input{chapters/requirement-specification/rich-picture} + +\input{chapters/requirement-specification/stakeholders} + +\input{chapters/requirement-specification/requirement-elicitation} + +\input{chapters/requirement-specification/data-analysis} + +\input{chapters/requirement-specification/data-summary} + +\input{chapters/requirement-specification/context-digram} + +\input{chapters/requirement-specification/use-cases} + +\input{chapters/requirement-specification/requirements-specification} + +\input{chapters/requirement-specification/summary} \ No newline at end of file diff --git a/documentation/PSPD/chapters/requirement-specification/requirement-elicitation.tex b/documentation/PSPD/chapters/requirement-specification/requirement-elicitation.tex new file mode 100644 index 0000000..28aa66b --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/requirement-elicitation.tex @@ -0,0 +1,26 @@ +\section{Requirements Elicitation Methodologies} + +When developing a software project, one of the very first steps that need to be done is requirements engineering. Without following this process, it is difficult to come up with a product that users actually want to use. In this section, the author will describe the techniques he has used to gather requirements with their results. + +\newpage + +\begin{longtable}{|p{160mm}|} +\hline +\textbf{Literature Review} \\ \hline +A Literature review is the fundamental building block of any research project. It helps to understand existing systems and how they work, and also Issues and gaps in those established systems. Since this research project has 3 sub-components to make up the full system, a literature review was done on existing instrumentation, anomaly detection, and root course localization systems. \\ \hline + +\textbf{Interviews} \\ \hline +The target audience for this project will be mostly reliability engineers. Having a one-on-one interview with them and discussing about the project idea and the implementation path uncovered some overlooked use cases and possible improvements to the current implementation. \\ \hline + + +\textbf{Self-evaluation} \\ \hline +Since the initial idea for the project came from a difficulty the author faced maintaining a distributed system during the industrial placement period. The author was able to carry out several self-evaluations during the course of the project and realigned the project scope with the original issue so the project is always on track. \\ \hline + +\textbf{Brainstorming} \\ \hline +As mentioned above since this project originated from a practical problem the author faced. The author was able to use his own experience and self brainstorm some key requirements of the project that he would personally like to be included. \\ \hline + +\textbf{Prototyping} \\ \hline +As the system is getting built, requirements get added and removed due to some requirements getting too complex to build or some additional requirements need to be met to have the core functionality working. 
This also gives an opportunity to share the current progress with a subset of the target audience, get their feedback, and improve upon it. \\ \hline + +\caption{Selected requirement elicitation methods (self-composed)} +\end{longtable} diff --git a/documentation/PSPD/chapters/requirement-specification/requirements-specification.tex b/documentation/PSPD/chapters/requirement-specification/requirements-specification.tex new file mode 100644 index 0000000..3fd9684 --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/requirements-specification.tex @@ -0,0 +1,246 @@ +\section{Requirements Specifications} + +Since this project touches the deep ends of both \ac{sre} and data science, the priorities of the system must be managed carefully to achieve a reliable and functioning system by the deadline. To achieve this, the MoSCoW prioritization model was used. + +\begin{longtable}{|p{25mm}|p{128mm}|} +\hline + \textbf{Priority Level} & + \textbf{Description} \\ \hline + + Must have & + Requirements which need to be met in order to have a minimum viable product. \\ \hline + + Should have & + Requirements that need to be completed to have a usable product. \\ \hline + + Could have & + Nice-to-have requirements that would improve the quality of life of the system. \\ \hline + + Will not have & + Requirements that will not get covered during this iteration but might get implemented in the future.\\ \hline +\caption{Requirement priorities (self-composed)} +\end{longtable} + +\begin{longtable}{|p{35mm}|p{118mm}|} +\hline + \textbf{Use Case ID} & \textbf{Use Case Name } \\ \hline + UC-01 & Deploy Lazy Koala \\ \hline + UC-02 & Update Configuration \\ \hline + UC-03 & Purge Lazy Koala \\ \hline + UC-04 & Check for Root Causes \\ \hline + UC-05 & Generate Report \\ \hline + UC-06 & Read from the database \\ \hline + UC-07 & Extract telemetry (Every 5 seconds) \\ \hline + % UC-08 & Update Network topology \\ \hline + UC-08 & Check for Anomalies (Every 1 minute) \\ \hline + UC-09 & Write to the database \\ \hline + UC-10 & Reconcile on modified resources \\ \hline + UC-11 & Update cluster state \\ \hline +\caption{Use cases of the system (self-composed)} +\end{longtable} + +% \newpage + +\newcommand{\functionalRequirement}[5]{ + #1 & + % \makecell[{{p{109mm}}}]{\textbf{#2}\\#3} & + \textbf{#2} \newline #3 & + #4 & + #5 \\ \hline +} + +\newpage +\subsection{Functional Requirements} + +\begin{longtable}{|p{9mm}|p{109mm}|p{14mm}|p{13mm}|} +\hline + \textbf{ID} & + \textbf{Requirement and Description} & + \textbf{Priority Level} & + \textbf{Use case} \\ \hline + + + \functionalRequirement + {FR01} + {Users should be able to deploy the \ac{lazy-koala-operator} to an existing Kubernetes cluster.} + {The \ac{lazy-koala-operator} should work on any Linux-based Kubernetes cluster with version 1.22 or higher without any additional configuration from the user's end.} + {Must have} + {UC-01} + + \functionalRequirement + {FR02} + {Users should be able to remove the \ac{lazy-koala-operator} completely from the cluster.} + {Once uninstalled, all the provisioned resources should be cleaned up by the \ac{lazy-koala-operator} itself.
} + {Should have} + {UC-03} + + + \functionalRequirement + {FR03} + {Users should be able to specify which services need to be monitored.} + {System should allow the user to exclude some services getting tracked.} + {Must have} + {UC-02} + + + \functionalRequirement + {FR04} + {Users should be able to see the services monitored by \ac{lazy-koala-operator}} + {System should be transparent to the user about monitored and unmonitored service.} + {Could have} + {UC-05} + + + \functionalRequirement + {FR05} + {\ac{lazy-koala-operator} should deploy an instance of \ac{gazer} to every node in the cluster.} + {In the Kubernetes cluster every node has a separate instance of the Linux kernel. So for every instance of Linux kernel, an instance of \ac{gazer} must be present to ensure all the relevant data is captured.} + {Must have} + {UC-01, UC-10} + + + \functionalRequirement + {FR06} + {\ac{gazer} should intersect all “inet\_sock\_set\_state” kernel calls and export the relevant data to Prometheus.} + {Whenever a userspace application makes a TCP call, this kernel method is invoked to communicate with the network interface. Inspecting the data structures of this will allow us to extract a lot of information about each TCP calls.} + {Must have} + {UC-07, UC-09} + + + \functionalRequirement + {FR07} + {\ac{gazer} periodically poll the size of “sk\_ack\_backlog” for interested ports to export the relevant data to Prometheus.} + {This kernel data structure holds the TCP connections that are left to be acknowledged. Knowing the size of this queue will help to understand the efficiency of each service.} + {Should have} + {UC-07, UC-09} + + + \functionalRequirement + {FR08} + {\ac{gazer} should poll Kubernetes metric server periodically and export the relevant data to Prometheus.} + {Sudden changes in CPU and Memory usage will be a good indication for an anomaly. So exporting those to be processed later will be wise.} + {Must have} + {UC-07, UC-09} + + + \functionalRequirement + {FR09} + {\ac{lazy-koala-operator} should periodically check for changes in monitored services and update the \ac{gazer} ConfigMap.} + {Kubelet is watching over all the services on the system and restarts them if they become unhealthy. With that, the IP address of that service is gonna change and \ac{lazy-koala-operator} is responsible to let the \ac{gazer} know of such changes.} + {Should have} + {UC-10, UC-11} + + + \functionalRequirement + {FR10} + {\ac{gazer} should react to config updates in realtime.} + {When a \ac{lazy-koala-operator} pushes a new config, \ac{gazer} should look for the new IPs without requiring a complete reset. 
Which is time consuming and expensive.} + {Could have} + {UC-10, UC-11} + + + \functionalRequirement + {FR11} + {Lazy Koala should provision an instance of \ac{sherlock} for each of the monitored services.} + {\ac{sherlock} is responsible for analyzing all the metric data that are exported from \ac{gazer}.} + {Must have} + {UC-10, UC-11} + + + \functionalRequirement + {FR12} + {\ac{sherlock} should periodically calculate the anomaly score for each of the monitored services and export it to Prometheus.} + {Anomaly score is used by the UI to understand the spread of an anomaly.} + {Must have} + {UC-08, UC-09} + + + \functionalRequirement + {FR13} + {\ac{lazy-koala-operator} should have a Web UI to visualize the service topology.} + {UI should help users to visualize the spread of an anomaly throughout all of the monitored service.} + {Should have} + {UC-04, UC-05, UC-06} + + + \functionalRequirement + {FR14} + {\ac{lazy-koala-operator} should add a finalizer for each of the provisioned resources.} + {Finalizers ensure the parent of a resource won’t be deleted before all the children are cleaned up. This avoids leaving the cluster with orphaned resources that won’t be cleaned up without user intervention.} + {Should have} + {UC-03} + + \functionalRequirement + {FR15} + {\ac{lazy-koala-operator} should periodically fine-tune models.} + {Microservices could get complex and change over time. To combat this \ac{lazy-koala-operator} make sure the models get adapted to the new changes done in monitored services.} + {Will not have} + {UC-04, UC-10} + + + + +\caption{Functional requirements (self-composed)} +\end{longtable} + +\subsection{Non-Functional Requirements} + +\begin{longtable}{|p{13mm}|p{89mm}|p{26mm}|p{18mm}|} +\hline + \textbf{ID} & + \textbf{Description} & + \textbf{Specification} & + \textbf{Priority Level} \\ \hline + + NFR1 & + \ac{lazy-koala-operator} should follow Principle of Least Privilege when accessing Kubernetes APIs. & + Security & + Must have \\ \hline + + NFR2 & + Systems should have fragmented architecture so each component can be individually scaled in order to save resources. & + Scalability & + Must have \\ \hline + + NFR3 & + Each component should work individually such that users can install parts of the system they are interested in. & + Usability & + Could have \\ \hline + + NFR4 & + \ac{gazer} should be limited for using only 100 mCPUs and 80MB of memory. & + Performance & + Should have \\ \hline + + NFR5 & + \ac{sherlock} should be limited to using only 100 mCPUs and 100MB of memory. & + Performance & + Could have \\ \hline + + NFR6 & + \ac{lazy-koala-operator} should be packaged as a Helm Chart for ease of use. & + Usability & + Must have \\ \hline + + NFR7 & + Reconstruction error of \ac{sherlock} should be under 0.1\% & + Performance & + Could have \\ \hline + + NFR8 & + \ac{lazy-koala-operator}’s reconciliation loop should use exponential backoff technique when there is an error while reconciling for a config change. & + Reliability & + Must have \\ \hline + + NFR9 & + Follow Coding best practices and rely on linters for code formatting. & + Maintainability & + Could have \\ \hline + + NFR10 & + The project should be backed by an automated CI/CD tool to test and build each component with each release. 
& + Maintainability & + Could have \\ \hline + +\caption{Non-Functional requirements (self-composed)} +\end{longtable} \ No newline at end of file diff --git a/documentation/PSPD/chapters/requirement-specification/rich-picture.tex b/documentation/PSPD/chapters/requirement-specification/rich-picture.tex new file mode 100644 index 0000000..e1480d1 --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/rich-picture.tex @@ -0,0 +1,9 @@ +\section{Rich Picture} + +\begin{figure}[H] + \includegraphics[width=16cm]{assets/requirement-specification/rich-picture.png} + \caption{Rich picture diagram (self-composed)} + \label{fig:rich-picture} +\end{figure} + +The purpose of this project is to create an AI-powered monitoring system that will help \acp{sres} to detect and diagnose issues in the system quickly. The flow of the system goes as follows, The manager requests the development team to create a software to full fill a customer's need. Then the development team will develop the software and with the help of \acp{sres} the software will be deployed for public use. After that, it's \acp{sres} duty to keeping the system up and running while doing routine maintenance. While the software is running another monitoring system will keep an eye on its health and general behavior. If one or more components of the system try to deviate from its regular behavior it's the monitoring system's job to notify \acp{sres} of a few possible reasons for this unexpected behavior. Finally, \acp{sres} and the development team will launch a coordinated effort to identify the real issue and resolve it quickly as possible. \ No newline at end of file diff --git a/documentation/PSPD/chapters/requirement-specification/stakeholders.tex b/documentation/PSPD/chapters/requirement-specification/stakeholders.tex new file mode 100644 index 0000000..6f76b7c --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/stakeholders.tex @@ -0,0 +1,76 @@ +\section{Stakeholder Analysis} + +To properly plan the system, the stakeholders and their roles need to be identified. Figure \ref{fig:stakeholder-onion} shows a visual representation of stakeholders using a onion model. + +\subsection{Onion Model} + +\begin{figure}[H] + \includegraphics[width=15cm]{assets/requirement-specification/onion-model.png} + \caption{Stakeholder onion model (self-composed)} + \label{fig:stakeholder-onion} +\end{figure} + + +\begin{longtable}{|p{35mm}|p{44mm}|p{72mm}|} + \hline + \textbf{Stakeholder} & + \textbf{Role} & + \textbf{Viewpoint} + \\ \hline + + Reliability Engineer & + Functional Beneficiary & + Use the proposed system to understand issues quickly. \\ \hline + + Software Engineer & + Functional Beneficiary & + Use the proposed system to debug issues quickly. \\ \hline + + Distributed System & + Functional Beneficiary & + Become more reliable thanks to lower down time and \ac{mttr}. \\ \hline + + Project Owner & + Financial Beneficiary & + Owning a very useful tool that can be licensed to enterprise companies. \\ \hline + + System Developer & + Financial Beneficiary & + Sharpen the development skills and developer portfolio while earning royalty. \\ \hline + + Support Engineer & + Functional Beneficiary & + Less time dealing with frustrated users. \\ \hline + + End Users & + Functional Beneficiary & + Enjoy a more reliable product. \\ \hline + + Data Scientist & + Functional Beneficiary & + Expand upon the concepts and models created for the project. 
\\ \hline + + InfoSec Analyst & + Functional Beneficiary & + Use the anomaly detection algorithm to find unusual activities. \\ \hline + + Evaluator & + Advisory & + Expand tools and technologies available in the field of reliability engineering. \\ \hline + + Supervisor & + Advisory & + Provide guidance to supervisee so they can successfully complete the project. \\ \hline + + Spammer & + Negative stakeholder & Creates abnormal behaviors to trigger false alarms and waste developer time and resources. \\ \hline + + Hacker & + Negative stakeholder & + Exploit the proposed system and gain illegal access to the monitored system. \\ \hline + + Competitor & + Negative stakeholder & + Try to replicate results to expand their market share. \\ \hline +\caption{Stakeholder description (self-composed)} +\end{longtable} \ No newline at end of file diff --git a/documentation/PSPD/chapters/requirement-specification/summary.tex b/documentation/PSPD/chapters/requirement-specification/summary.tex new file mode 100644 index 0000000..aae3008 --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/summary.tex @@ -0,0 +1,3 @@ +\section{Chapter Summary} +% \vspace{-1mm} +This chapter started by explaining the stakeholders along with their involvement in this project. Then proceeded to explain the qualitative requirements engineering techniques used to identify the requirement of this project along with findings from each of the techniques. Next, the flow of information within the system was explained using a context diagram. Then, use cases of the system were derived from identified requirements, and main use cases were further explained with use case descriptions. Finally, the chapter was concluded with an overview of both functional and non-functional requirements along with their priority levels. \ No newline at end of file diff --git a/documentation/PSPD/chapters/requirement-specification/use-cases.tex b/documentation/PSPD/chapters/requirement-specification/use-cases.tex new file mode 100644 index 0000000..a7fc9a0 --- /dev/null +++ b/documentation/PSPD/chapters/requirement-specification/use-cases.tex @@ -0,0 +1,124 @@ +\section{Use Case Diagram} + +\begin{figure}[H] + \includegraphics[width=15cm]{assets/requirement-specification/use-case.png} + \caption{Use case diagram (self-composed)} + % \label{fig:poc-autoencoder} +\end{figure} + + +\newcommand{\UseCaseDescription}[9]{ + \textbf{} + \begin{longtable}{|p{40mm}|p{113mm}|} + \hline + \textbf{Use Case ID} & \textbf{#1} \\ \hline + \textbf{Use Case Name} & #2 \\ \hline + \textbf{Description} & #3 \\ \hline + \textbf{Participating actors} & #4 \\ \hline + \textbf{Preconditions} & #5 \\ \hline + \textbf{Extended use cases} & #6 \\ \hline + \textbf{Included use cases} & #7 \\ \hline + \textbf{Main flow} & #8 \\ \hline + \UseCaseDescriptionContinued#9 + % \caption{#2 (Self Composed)} + \end{longtable} +} + +\newcommand{\UseCaseDescriptionContinued}[3]{ + \textbf{Alternative flows} & #1 \\ \hline + \textbf{Exceptional flows} & #2 \\ \hline + \textbf{Postconditions} & #3 \\ \hline +} + +\newenvironment{CompactItemizes} +{ \vspace{-8mm}\begin{itemize}[leftmargin=*,noitemsep,nolistsep]} +{ \vspace{-7mm}\end{itemize}} + +\newenvironment{CompactEnumerate} +{ \vspace{-8mm}\begin{enumerate}[leftmargin=*,noitemsep,nolistsep]} +{ \vspace{-7mm}\end{enumerate}} + + +\section{Use Case Descriptions} +Due to the page limits, only the main use-case description is present here. 
Please refer to the rest of the use-case descriptions in Appendix \ref{appendix:use-case-description}. + +\vspace{-2em} +\UseCaseDescription +{UC-04} +{Check for Root Causes} +{Look at the service topology graph to find out the root cause of an issue.} +{Software Engineer\newline +Reliability Engineer} +{\begin{CompactItemizes} + \item kubectl installed and configured to talk to a Kubernetes cluster. + \item The Kubernetes cluster has the \ac{lazy-koala-operator} deployed. + \item Established port forwarding connection with \ac{lazy-koala-operator}. +\end{CompactItemizes}} +{N/A} +{Generate Report\newline +Read from the database} +{\begin{CompactEnumerate} + \item Visit the forwarded port on the local machine. + \item Open the Monitor tab. + \item Inspect the graph. +\end{CompactEnumerate}} +{{N/A} +{\textbf{E1}: the \ac{lazy-koala-operator} returns a non-200 status code. +\vspace{-4mm}\begin{enumerate} + \item Show the error to the user. +\vspace{-7mm}\end{enumerate}} +{N/A}} + + +\vspace{-2em} +\UseCaseDescription +{UC-07} +{Extract telemetry} +{Every 5 seconds \ac{gazer} will scrape the metric server} +{System Timer} +{\begin{CompactItemizes} + \item \ac{gazer} is deployed to the cluster +\end{CompactItemizes}} +{N/A} +{Write to the database} +{\begin{CompactEnumerate} + \item The poll\_kube\_api function gets invoked. + \item \ac{gazer} looks at the config file and finds out the services it’s responsible for. + \item Query the metric server for each of the service names. + \item Store the results in local memory. +\end{CompactEnumerate}} +{{N/A} +{\textbf{E1}: the metric server returns a non-200 status code. +\vspace{-4mm}\begin{enumerate} + \item Retry in the next iteration. +\vspace{-7mm}\end{enumerate}} +{\begin{CompactItemizes} + \item Updated local memory with recent telemetry data. +\end{CompactItemizes}}} + +\vspace{-2em} +\UseCaseDescription +{UC-09} +{Check for Anomalies} +{Check for anomalies in each of the monitored services} +{System Timer} +{\begin{CompactItemizes} + \item An instance of \ac{sherlock} is deployed. +\end{CompactItemizes}} +{N/A} +{Write to the database} +{\begin{CompactEnumerate} + \item The check\_anomlies function gets invoked. + \item Query the database for telemetry from the last 5 minutes. + \item Do a forward pass on the model. + \item Calculate the reconstruction loss. + \item Store it in local memory. +\end{CompactEnumerate}} +{{N/A} +{\textbf{E1}: the database is unreachable. +\vspace{-4mm}\begin{enumerate} + \item Retry in the next iteration. +\vspace{-7mm}\end{enumerate}} +{\begin{CompactItemizes} + \item Updated local memory with the current reconstruction loss. +\end{CompactItemizes}}} diff --git a/documentation/PSPD/chapters/system-design/chapter-overview.tex b/documentation/PSPD/chapters/system-design/chapter-overview.tex new file mode 100644 index 0000000..9d4d96b --- /dev/null +++ b/documentation/PSPD/chapters/system-design/chapter-overview.tex @@ -0,0 +1,3 @@ +\section{Chapter Overview} + +This chapter focuses on the overall design of the proposed system. First, the author will discuss the design goals for creating this system and the philosophies behind them. Then the system architecture will be explained, along with how each layer of the system integrates with the others. Finally, the chapter will conclude with the design of the proposed system along with the low-fidelity UI design, which will showcase how the UIs of the system will look.
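As a complement to the UC-07 and UC-09 descriptions above, the sketch below illustrates how the UC-09 flow could look in code: query Prometheus for the last five minutes of telemetry, run a forward pass through the trained model, and keep the resulting reconstruction loss in local memory. The metric name, model object, and helper names are assumptions for illustration only, not the actual Sherlock implementation.

# Illustrative sketch only (assumed names); not the actual Sherlock code.
import numpy as np
from prometheus_api_client import PrometheusConnect, MetricRangeDataFrame
from prometheus_api_client.utils import parse_datetime

prom = PrometheusConnect(url="http://prometheus:9090", disable_ssl=True)
anomaly_scores = {}  # UC-09 "local memory"

def check_anomalies(model, service_name):
    # Steps 1-2: query the database (Prometheus) for the last 5 minutes of telemetry.
    metric_data = prom.custom_query_range(
        f'avg_over_time(cpu_seconds{{serviceName="{service_name}"}}[1m])',
        start_time=parse_datetime("5m"),
        end_time=parse_datetime("now"),
        step="15",
    )
    window = MetricRangeDataFrame(metric_data)["value"].to_numpy(dtype=float)

    # Steps 3-4: forward pass through the model and reconstruction loss (MSE).
    reconstruction = model.predict(window.reshape(1, -1))
    loss = float(np.mean((window - reconstruction.flatten()) ** 2))

    # Step 5: store the score in local memory; a failed query is simply
    # retried in the next iteration, as in exceptional flow E1.
    anomaly_scores[service_name] = loss
    return loss

Whether the real model expects a fixed-length window or a different feature layout is not specified in the use case, so the reshape above is only a placeholder.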
\ No newline at end of file diff --git a/documentation/PSPD/chapters/system-design/chapter-summary.tex b/documentation/PSPD/chapters/system-design/chapter-summary.tex new file mode 100644 index 0000000..5c82494 --- /dev/null +++ b/documentation/PSPD/chapters/system-design/chapter-summary.tex @@ -0,0 +1,4 @@ +\section{Chapter Summary} + +After discovering requirements through many methods, this chapter focuses on creating a system around those requirements. The chapter started with the design goals the author set for himself and moved on to explain all the layers of the system in detail, along with each component. Finally, the chapter was concluded with an explanation of the design paradigm and the diagrams associated with it. + diff --git a/documentation/PSPD/chapters/system-design/design-goals.tex b/documentation/PSPD/chapters/system-design/design-goals.tex new file mode 100644 index 0000000..d392b1d --- /dev/null +++ b/documentation/PSPD/chapters/system-design/design-goals.tex @@ -0,0 +1,22 @@ +\section{Design Goals} +\begin{longtable}{|p{22mm}|p{131mm}|} +\hline +\textbf{Design Goal} & + \textbf{Description} \\ \hline + Modularity & + Since this is designed to work in a cloud-native environment, it’s considered best practice to have all the components loosely coupled. During the requirement engineering phase, two of the industry experts expressed their interest in integrating this project into some of their existing tooling. So having a modular design will help their efforts too. \\ \hline + + Lightweight & + As this was designed to be a supporting system to existing distributed systems, it needs to be as lightweight as possible to justify its use. If the supporting system consumes more resources than the target system, it won’t be practical to use. \\ \hline + + No Code Change & + It’s unlikely for developers to update all the services in a distributed system to match a monitoring system. So to increase adaptability, this system should be able to work without any instrumentation from the developers’ side. \\ \hline + + Extensibility & + One of the core goals of this project is to be a starting place for future researchers who are looking into root cause analysis. So making this toolkit extensible will greatly help their efforts. \\ \hline + + Scalability & + Since the main target audience of this product is large enterprises with huge systems, this system should be able to scale up to their level in order to be relevant.
\\ \hline + + \caption{Project design goals (self-composed)} +\end{longtable} \ No newline at end of file diff --git a/documentation/PSPD/chapters/system-design/main.tex b/documentation/PSPD/chapters/system-design/main.tex new file mode 100644 index 0000000..a99550e --- /dev/null +++ b/documentation/PSPD/chapters/system-design/main.tex @@ -0,0 +1,9 @@ + +% \vspace{-1mm} +\chapter{Initial System Design} + +\input{chapters/system-design/chapter-overview} +\input{chapters/system-design/design-goals} +\input{chapters/system-design/system-architecture} +\input{chapters/system-design/system-design} +\input{chapters/system-design/chapter-summary} \ No newline at end of file diff --git a/documentation/PSPD/chapters/system-design/system-architecture.tex b/documentation/PSPD/chapters/system-design/system-architecture.tex new file mode 100644 index 0000000..68dd544 --- /dev/null +++ b/documentation/PSPD/chapters/system-design/system-architecture.tex @@ -0,0 +1,39 @@ +\section{System Architecture} + +System architecture design gives a bird's-eye view of how all the components in the system communicate with each other. This helps us to understand the dependencies and responsibilities of each component. Since this system is designed to run in a microservices-based environment, an n-tier architecture was used to physically separate the components in the system to have better reliability and scalability. + +\begin{figure}[H] + \includegraphics[width=14cm]{assets/system-design/tier-architecture.png} + \caption{Tiered architecture (self-composed)} + \label{fig:tier-architecture} +\end{figure} + +\subsection{Presentation Tier} + +The presentation tier will run entirely on the client's computer while depending on the logic tier for data. + +\begin{itemize} + \item \textbf{Monitoring Dashboard} - This view is responsible for helping the user to understand the service topology and visually identify issues in the system. + \item \textbf{Settings View} - On the settings page, users can choose which services need to be monitored along with their DNS addresses. +\end{itemize} + +\subsection{Logic Tier} + +The logic tier will contain three custom microservices that depend on Kubernetes's core modules to operate. + +\begin{itemize} + \item \textbf{\ac{lazy-koala-operator}} - The \ac{lazy-koala-operator} is the main bridge between Kubernetes APIs and this system. It also contains a proxy server that redirects incoming client requests to kube-api. + \item \textbf{\ac{gazer}} - An instance of \ac{gazer} will be running on every node in the Kubernetes cluster, passively extracting telemetry and sending it over to the Prometheus server. + \item \textbf{\ac{sherlock}} - The AI engine periodically queries Prometheus to get the current status of all the monitored services. Then it calculates an anomaly score for each service and pushes it to Prometheus so it can be sent back to the presentation layer. + \item \textbf{kube-apiserver} - This is an API provided by Kubernetes that helps to read and update the cluster status programmatically. + \item \textbf{kube-scheduler} - kube-scheduler is responsible for smartly provisioning requested resources in the available space. + \item \textbf{kube-controller-manager} - This service sends updates to all the operators running on the cluster whenever there is a change to a resource owned by the specific operator.
+\end{itemize} + +\subsection{Data Tier} + +\begin{itemize} + \item \textbf{Artifact registry} - All the pre-trained models and built containers will be saved here for easy access. + \item \textbf{Prometheus} - Prometheus is a time-series database that is highly optimized for storing service telemetry. + \item \textbf{etcd} - etcd is a distributed key-value store that will be responsible for holding the resource specifications and the \ac{gazer} config. +\end{itemize} \ No newline at end of file diff --git a/documentation/PSPD/chapters/system-design/system-design.tex b/documentation/PSPD/chapters/system-design/system-design.tex new file mode 100644 index 0000000..6832a8c --- /dev/null +++ b/documentation/PSPD/chapters/system-design/system-design.tex @@ -0,0 +1,67 @@ +\section{System Design} + +\subsection{Design Paradigm} + +When building a software application, there are two main design paradigms to choose from to organize the code base. Object-Oriented Analysis and Design (OOAD), which is very popular among programming languages such as Java and C\#, is a way of mimicking the behavior of real-world objects and how they interact in the real world. However, this project has a lot of components that are loosely coupled and implemented in many different languages and frameworks. So the Structured Systems Analysis and Design Method (SSADM) was chosen as the design paradigm for this project. + +\subsection{Data-flow diagram} + +The data-flow diagram explains the flow of request data within the system and how the processes in the system interact with each other at a high level. + +\begin{figure}[H] + \includegraphics[width=13cm]{assets/system-design/data-flow-level-1.png} + \caption{Data-flow diagram - level 1 (self-composed)} + % \label{fig:data-flow} +\end{figure} + +\begin{figure}[H] + \includegraphics[width=15cm]{assets/system-design/data-flow-level-2.png} + \caption{Data-flow diagram - level 2 (self-composed)} + % \label{fig:data-flow} +\end{figure} + + +\subsection{Sequence Diagram} + +Sequence diagrams are meant to showcase the flow of instructions within sub-components of the system. The diagrams below explain how the system reacts when two of the main core functionalities are invoked. + +\begin{figure}[H] + \centering + \begin{subfigure}[b]{0.70\textwidth} + \centering + \includegraphics[width=\textwidth]{assets/system-design/sequence-diagram-1.png} + \caption{Check for root cause} + \end{subfigure} + \hfill + \begin{subfigure}[b]{0.70\textwidth} + \centering + \includegraphics[width=\textwidth]{assets/system-design/sequence-diagram-2.png} + \caption{Calculate anomaly score} + \end{subfigure} + \hfill + \caption{Sequence diagrams (self-composed)} +\end{figure} + +\subsection{UI Design} + +Since this project was developed as a Kubernetes-native application, most of the functionality works as a daemon process in the background. However, there are two use cases where having a visual user interface greatly increases the usability of this project. The UI mockups attached below showcase two of those use-cases. Figure \ref{fig:ui-home} displays how developers will be able to inspect the topology of the system and find issues in real time, while Figure \ref{fig:ui-settings} showcases the settings page, which is used to tag the services in the system that need to be monitored.
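The \ac{sherlock} entry in the Logic Tier list above says the anomaly score is pushed back to Prometheus so the presentation tier can read it. One plausible way to do that (an assumption, since the actual mechanism is not spelled out here) is to expose the scores on a small HTTP endpoint that Prometheus scrapes, using the prometheus_client library; the metric name, label, port, and refresh interval below are illustrative only.

# Hedged sketch: exposing per-service anomaly scores for Prometheus to scrape.
# Metric name, label, port, and refresh interval are assumptions.
import time
from prometheus_client import Gauge, start_http_server

anomaly_gauge = Gauge(
    "anomaly_score", "Reconstruction-error based anomaly score", ["serviceName"]
)

def serve_scores(scores, port=9091):
    start_http_server(port)  # Prometheus scrapes this endpoint on its own interval
    while True:
        for service, score in scores.items():
            anomaly_gauge.labels(serviceName=service).set(score)
        time.sleep(15)  # refresh roughly once per scrape interval

A Pushgateway or remote-write setup would work just as well; the choice only affects how the score series reaches Prometheus, not how the UI consumes it.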
+ +\begin{figure}[H] + \centering + \begin{subfigure}[b]{0.75\textwidth} + \centering + \includegraphics[width=\textwidth]{assets/system-design/ui-home.png} + \caption{Inspector View} + \label{fig:ui-home} + \end{subfigure} + \hfill + \begin{subfigure}[b]{0.75\textwidth} + \centering + \includegraphics[width=\textwidth]{assets/system-design/ui-settings.png} + \caption{Settings View} + \label{fig:ui-settings} + \end{subfigure} + \hfill + % \label{fig:ui-mocks} + \caption{UI mockups (self-composed)} +\end{figure} \ No newline at end of file diff --git a/documentation/PSPD/cover-page.tex b/documentation/PSPD/cover-page.tex new file mode 100644 index 0000000..289c8de --- /dev/null +++ b/documentation/PSPD/cover-page.tex @@ -0,0 +1,50 @@ +% TITLE PAGE--------------------------------------------------- +\begin{titlepage} + +\begin{tikzpicture}[remember picture, overlay] + \draw[line width = 1pt] ($(current page.north west) + (1cm,-1cm)$) rectangle ($(current page.south east) + (-1cm,1cm)$); +\end{tikzpicture} + +\begin{center} + + % Upper part of the page +\text{\large Informatics Institute of Technology}\\[0.1cm] +\text{\large In Collaboration With}\\[0.1cm] +\text{\large University of Westminster, UK}\\[0.4cm] + + +% Logo +\includegraphics[height=6.0cm]{assets/uow-logo.png}\\[1.3cm] + +% \setstretch{2.5} +% Title +{\fontfamily{qtm}\selectfont\setstretch{2} +{ \Huge Lazy-Koala: A Lazy Approach for Root Cause Analysis in Distributed Systems }\\[1.7cm] +} +% \begin{minipage}{0.45\textwidth} +\text{\LARGE Project Specifications Design and Prototype}\\[1.5cm] + +% Authors +% \text{\large A dissertation by}\\[0.1cm] +% \text{\large \textbf{Mr. Isala Piyarisi}}\\[0.1cm] +% \text{\large w1742118 / 2018421}\\[3.2cm] + +% % Supervisor +% \text{\large Supervised by}\\[0.1cm] +% \text{\large \textbf{Mr. Guhanathan Poravi}}\\[3.1cm] + +% Supervisor +\text{\large \textbf{Supervisor}: Guhanathan Poravi}\\[0.1cm] +\text{\large \textbf{Date}: $3^{rd} $ March 2022}\\[0.1cm] +\text{\large \textbf{Department}: Computer Science}\\[3cm] +% \text{\large \textbf{Keywords}: Cloud Computing, AIOps, Monitoring, Disaster Recovery}\\[5cm] + + +\large{Submitted in partial fulfilment of the requirements for the +BSc(Hons) Computer Science degree at the University of Westminster.} \\[0.5cm] +% \large{May 2022} + + +\end{center} + +\end{titlepage} \ No newline at end of file diff --git a/documentation/PSPD/main.tex b/documentation/PSPD/main.tex new file mode 100644 index 0000000..dc2d0ef --- /dev/null +++ b/documentation/PSPD/main.tex @@ -0,0 +1,69 @@ +\documentclass[12pt]{report} + +% Include all packages from file. 
+\input{preamble} + +\renewcommand{\contentsname}{Table of Contents} +\renewcommand{\baselinestretch}{1.5} + +% HEADER AND FOOTER-------------------------- +\patchcmd{\chapter}{\thispagestyle{plain}}{\thispagestyle{fancy}}{}{} +\pagestyle{fancy} +\fancyhf{} +\lhead{\fontsize{10}{12}\leavevmode\selectfont\color{gray}{Lazy-Koala: A Lazy Approach for Root Cause Analysis in Distributed Systems}} +\rfoot{\fontsize{10}{12}\leavevmode\selectfont\color{gray}{\thepage}} +\lfoot{\fontsize{10}{12}\leavevmode\selectfont\color{gray}{Isala Piyarisi | 2018421}} +\renewcommand{\headrulewidth}{0pt} +% END HEADER AND FOOTER-------------------------- + + +% Document begins here +\begin{document} + +\input{cover-page} + +% Page Numbering--------------------------------------------- +\pagenumbering{roman} + +\input{chapters/abstract} +\addcontentsline{toc}{chapter}{Abstract} + +% Table of Contents--------------------------------------------------- +\tableofcontents +% List of Figures--------------------------------------------------- +% \cleardoublepage +{\let\clearpage\relax +\listoffigures +} +\addcontentsline{toc}{chapter}{\listfigurename} +% List of Tables--------------------------------------------------- +{\let\clearpage\relax +\listoftables +} +\addcontentsline{toc}{chapter}{\listtablename} + +% \cleardoublepage +\phantomsection +% Include acronyms +\addcontentsline{toc}{chapter}{List of Acronyms} +% {\let\clearpage\relax +\input{acronym} +% } +\zerospacingchapter +\input{chapters/introduction/main} +\input{chapters/requirement-specification/main} +\input{chapters/system-design/main} +\input{chapters/implementation/main} + +\cleardoublepage +\phantomsection +\renewcommand{\bibname}{References} +\pagenumbering{Roman} +% \setcounter{page}{5} +\addcontentsline{toc}{chapter}{References} +\bibliography{references.bib} + +\input{chapters/appendix/main} + +\end{document} + diff --git a/documentation/PSPD/preamble.tex b/documentation/PSPD/preamble.tex new file mode 100644 index 0000000..84a149c --- /dev/null +++ b/documentation/PSPD/preamble.tex @@ -0,0 +1,106 @@ +\usepackage[UKenglish]{babel} + +\usepackage{setspace} % LINE SPACING +\usepackage{tikz} +\usepackage{titlepic} +\usepackage{graphicx} +\usepackage{newtxtext,newtxmath} +\usepackage[utf8]{inputenc} +\usepackage[a4paper,left=24.5mm, right=24.5mm, top=24.5mm, bottom=24.5mm]{geometry} %Paper margins +\usepackage{titlesec} +\usepackage{enumitem} % Custom enumerations +\usepackage[titles]{tocloft} + +\usepackage{acronym} + +\usepackage{hyperref} % Links + +% Referencing +\usepackage{natbib} % Harvard referencing +% \usepackage[comma,colon]{natbib} +\citestyle{aysep={,}} +\bibliographystyle{agsm} + +% Tables +\usepackage{longtable} +\usepackage{multirow} +\usepackage{xstring} +\usepackage{geometry} +\usepackage{array} + + +% Fonts size +\usepackage[font={small,it}]{caption} + +% Date +\usepackage[useregional]{datetime2} + +\usetikzlibrary{calc} +\newcommand\HRule{\rule{\textwidth}{1pt}} +\newcommand{\hsp}{\hspace{10pt}} +\titlespacing*{\chapter}{0pt}{20pt}{20pt} % CHAPTER SPACING +\titleformat{\chapter}[hang]{\Huge\bfseries}{Chapter \thechapter\hsp:\hsp }{0pt}{\Huge\bfseries} +\titleformat{\chapter}[display]{\fontsize{20pt}{0pt}\bfseries}{}{2pt}{} +\setlength{\parindent}{10ex} + + +% % Setting TOC and Section number depth +% \setcounter{tocdepth}{4} +\setcounter{secnumdepth}{4} +% % line spacing in TOC +\setlength{\cftbeforechapskip}{3pt} + +\onehalfspacing + +\usepackage{chngcntr} + +% \usepackage[nottoc,numbib]{tocbibind} +\counterwithout{figure}{chapter} 
+\counterwithout{table}{chapter} + +\usepackage[font={small,it}]{caption} +\selectlanguage{UKenglish} + +\usepackage{floatrow} + +% Header and footer +\usepackage{fancyhdr} +\usepackage{etoolbox} + +\usepackage[titletoc]{appendix} +\usepackage{changepage} + + +% TITLE FORMATS +% CHAPTER SPACING & FONT +\titleformat{\chapter}[hang]{\fontsize{16pt}{0pt}\bfseries}{\thechapter}{1em}{} +% \titlespacing*{\chapter}{0pt}{20pt}{10pt} + +% SECTION FONT +\titleformat{\section}[hang]{\fontsize{14pt}{0}\bfseries}{\thesection}{1em}{} +% \titlespacing*{\section}{0pt}{20pt}{10pt} + +% SUBSECTION FONT AND SIZE +\titleformat{\subsection}[hang]{\fontsize{12pt}{0}\bfseries}{\thesubsection}{1em}{} +% \titlespacing*{\subsection}{0pt}{15pt}{10pt} + +\titleformat{\subsubsection}[hang]{\fontsize{12pt}{0}\itshape}{\thesubsubsection}{1em}{} +% \titlespacing*{\subsubsection}{0pt}{15pt}{10pt} + +\usepackage{chngcntr} +\counterwithin{table}{chapter} +\counterwithin{figure}{chapter} + +\usepackage{makecell} +\usepackage{xargs} +% \usepackage{paralist} + +\usepackage{caption} +\usepackage{subcaption} + +% \usepackage{acmart} +\newcommand*{\zerospacingchapter}{% + % CHAPTER SPACING & FONT +\titleformat{\chapter}[hang]{\fontsize{16pt}{0pt}\bfseries}{\thechapter}{1em}{} +\titlespacing*{\chapter}{0pt}{-10pt}{10pt} +} \ No newline at end of file diff --git a/documentation/PSPD/references.bib b/documentation/PSPD/references.bib new file mode 100644 index 0000000..2b4cb5e --- /dev/null +++ b/documentation/PSPD/references.bib @@ -0,0 +1,715 @@ +% Introduction +% Background +@misc{rimol_2021, + title = {Gartner Says Worldwide IaaS Public Cloud Services Market Grew 40.7\% in 2020}, + author = {Rimol, Meghan}, + year = 2021, + month = {Jun}, + journal = {Gartner}, + howpublished = {\url{https://www.gartner.com/en/newsroom/press-releases/2021-06-28-gartner-says-worldwide-iaas-public-cloud-services-market-grew-40-7-percent-in-2020}} +} +@misc{LessonsF52:online, + title = {Lessons From the Birth of Microservices at Google}, + author = {Daniel Spoonhower}, + year = 2018, + month = 12, + note = {(Accessed on 09/22/2021)}, + howpublished = {\url{https://dzone.com/articles/lessons-from-the-birth-of-microservices-at-google}} +} +@inproceedings{di2018migrating, + title = {Migrating towards microservice architectures: an industrial survey}, + author = {Di Francesco, Paolo and Lago, Patricia and Malavolta, Ivano}, + year = 2018, + booktitle = {2018 IEEE International Conference on Software Architecture (ICSA)}, + pages = {29--2909}, + organization = {IEEE} +} +@misc{Microser52:online, + title = {Microservices Adoption in 2020 – O’Reilly}, + author = {Mike Loukides, Steve Swoyer}, + year = 2020, + month = {07}, + note = {(Accessed on 09/22/2021)}, + howpublished = {\url{https://www.oreilly.com/radar/microservices-adoption-in-2020/}} +} +@misc{Understa56:online, + title = {Understanding cloud-native apps}, + author = {RedHat}, + year = {}, + month = {}, + note = {(Accessed on 08/26/2021)}, + howpublished = {\url{https://www.redhat.com/cloud-native-apps}} +} +@misc{Whataret68:online, + title = {What are the Benefits of CI/CD?}, + author = {JetBrains}, + year = {}, + month = {}, + note = {(Accessed on 08/26/2021)}, + howpublished = {\url{https://www.jetbrains.com/teamcity/ci-cd-guide/benefits-of-ci-cd/}} +} +@misc{5WaysYou35:online, + title = {5 Ways You're Probably Messing Up Your Microservices | OverOps}, + author = {Alex Zhitnitsky}, + year = 2019, + month = 5, + note = {(Accessed on 08/26/2021)}, + howpublished = 
{\url{https://www.overops.com/blog/5-ways-to-not-f-up-your-microservices-in-production/}} +} +% Problem + +% Research Motivation +@misc{Untangli35:online, + title = {Untangling Microservices or Balancing Complexity in Distributed Systems}, + author = {Vladik Khononov}, + year = 2020, + month = 4, + note = {(Accessed on 08/31/2021)}, + howpublished = {\url{https://blog.doit-intl.com/untangling-microservices-or-balancing-complexity-in-distributed-systems-7759987d44b1}} +} +@misc{OpenAI_dota, + title = {OpenAI Five}, + author = {OpenAI}, + year = 2018, + howpublished = {\url{https://blog.openai.com/openai-five/}} +} +@article{silver2017mastering, + title = {Mastering the game of go without human knowledge}, + author = {Silver, David and Schrittwieser, Julian and Simonyan, Karen and Antonoglou, Ioannis and Huang, Aja and Guez, Arthur and Hubert, Thomas and Baker, Lucas and Lai, Matthew and Bolton, Adrian and others}, + year = 2017, + journal = {nature}, + publisher = {Nature Publishing Group}, + volume = 550, + number = 7676, + pages = {354--359} +} +% Related Works +@inproceedings{du2018anomaly, + title = {Anomaly detection and diagnosis for container-based microservices with performance monitoring}, + author = {Du, Qingfeng and Xie, Tiandi and He, Yu}, + year = 2018, + booktitle = {International Conference on Algorithms and Architectures for Parallel Processing}, + pages = {560--572}, + organization = {Springer} +} +@inproceedings{kumarage2018anomaly, + title = {Anomaly Detection in Industrial Software Systems-Using Variational Autoencoders.}, + author = {Kumarage, Tharindu and De Silva, Nadun and Ranawaka, Malsha and Kuruppu, Chamal and Ranathunga, Surangika}, + year = 2018, + booktitle = {ICPRAM}, + pages = {440--447} +} +@inproceedings{kumarage2019generative, + title = {Generative adversarial networks (GAN) based anomaly detection in industrial software systems}, + author = {Kumarage, Tharindu and Ranathunga, Surangika and Kuruppu, Chamal and De Silva, Nadun and Ranawaka, Malsha}, + year = 2019, + booktitle = {2019 Moratuwa Engineering Research Conference (MERCon)}, + pages = {43--48}, + organization = {IEEE} +} +@article{oord2016wavenet, + title = {Wavenet: A generative model for raw audio}, + author = {Oord, Aaron van den and Dieleman, Sander and Zen, Heiga and Simonyan, Karen and Vinyals, Oriol and Graves, Alex and Kalchbrenner, Nal and Senior, Andrew and Kavukcuoglu, Koray}, + year = 2016, + journal = {arXiv preprint arXiv:1609.03499} +} +@inproceedings{chigurupati2017root, + title = {Root cause analysis using artificial intelligence}, + author = {Chigurupati, Asha and Lassar, Noah}, + year = 2017, + booktitle = {2017 Annual reliability and maintainability symposium (RAMS)}, + pages = {1--5}, + organization = {IEEE} +} +@article{gonzalez2017root, + title = {Root cause analysis of network failures using machine learning and summarization techniques}, + author = {Gonzalez, Jose Manuel Navarro and Jimenez, Javier Andion and Lopez, Juan Carlos Duenas and others}, + year = 2017, + journal = {IEEE Communications Magazine}, + publisher = {IEEE}, + volume = 55, + number = 9, + pages = {126--131} +} +@inproceedings{samir2019dla, + title = {DLA: Detecting and localizing anomalies in containerized microservice architectures using markov models}, + author = {Samir, Areeg and Pahl, Claus}, + year = 2019, + booktitle = {2019 7th International Conference on Future Internet of Things and Cloud (FiCloud)}, + pages = {205--213}, + organization = {IEEE} +} +@inproceedings{wu2020microrca, + title = 
{Microrca: Root cause localization of performance issues in microservices}, + author = {Wu, Li and Tordsson, Johan and Elmroth, Erik and Kao, Odej}, + year = 2020, + booktitle = {NOMS 2020-2020 IEEE/IFIP Network Operations and Management Symposium}, + pages = {1--9}, + organization = {IEEE} +} +@article{wang2020generalizing, + title = {Generalizing from a few examples: A survey on few-shot learning}, + author = {Wang, Yaqing and Yao, Quanming and Kwok, James T and Ni, Lionel M}, + year = 2020, + journal = {ACM Computing Surveys (CSUR)}, + publisher = {ACM New York, NY, USA}, + volume = 53, + number = 3, + pages = {1--34} +} +@inproceedings{zhang2019deep, + title = {A deep neural network for unsupervised anomaly detection and diagnosis in multivariate time series data}, + author = {Zhang, Chuxu and Song, Dongjin and Chen, Yuncong and Feng, Xinyang and Lumezanu, Cristian and Cheng, Wei and Ni, Jingchao and Zong, Bo and Chen, Haifeng and Chawla, Nitesh V}, + year = 2019, + booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence}, + volume = 33, + pages = {1409--1416} +} +% Approach + + +% Research Gap +@misc{GoingtoM51:online, + title = {Going to Market Faster: Most Companies Are Deploying Code Weekly, Daily, or Hourly}, + author = {Asami Novak}, + year = 2016, + month = {02}, + note = {(Accessed on 09/04/2021)}, + howpublished = {\url{https://newrelic.com/blog/best-practices/data-culture-survey-results-faster-deployment}} +} +@inproceedings{ribeiro2016should, + title = {"Why should i trust you?" Explaining the predictions of any classifier}, + author = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos}, + year = 2016, + booktitle = {Proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining}, + pages = {1135--1144} +} +% Research Challenges +@misc{WhatisCo78:online, + title = {What is Container Orchestration?}, + author = {IBM}, + year = 2021, + month = {05}, + note = {(Accessed on 09/14/2021)}, + howpublished = {\url{https://www.ibm.com/cloud/learn/container-orchestration}} +} +@misc{Googlead4:online, + title = {Google admits Kubernetes container tech is so complex, it's had to roll out an Autopilot feature to do it all for you • The Register}, + author = {Tim Anderson}, + year = 2021, + month = {02}, + note = {(Accessed on 09/14/2021)}, + howpublished = {\url{https://www.theregister.com/2021/02/25/google_kubernetes_autopilot/}} +} +% Research methods +@misc{1Philoso75:online, + title = {Philosophy of Science | Four Major Paradigms}, + author = {Ziaul Haque Munim}, + year = 2019, + month = {02}, + note = {(Accessed on 09/08/2021)}, + howpublished = {\url{https://www.youtube.com/watch?v=n8B50HJrAv0}} +} +@misc{Pragmati87:online, + title = {Pragmatism Research Philosophy}, + author = {John Dudovskiy}, + year = {}, + month = {}, + note = {(Accessed on 09/08/2021)}, + howpublished = {\url{https://research-methodology.net/research-philosophy/pragmatism-research-philosophy/}} +} +@misc{WhatAreT79:online, + title = {What Are The Top 8 Project Management Methodologies?}, + author = {Erica Chappell}, + year = 2021, + month = {01}, + note = {(Accessed on 09/22/2021)}, + howpublished = {\url{https://clickup.com/blog/project-management-methodologies/#36-7-what-is-the-prince2-methodology}} +} + + + + + + + + + + + + + + + + + + + + + + + + + + +%%% LR + +% Domain Overview + +@book{kim2014phoenix, + title={The Phoenix Project: A Novel About IT, DevOps, and Helping Your Business Win}, + author={Kim, G. and Behr, K. 
and Spafford, K.}, + isbn={9780988262584}, + url={https://books.google.lk/books?id=qaRODgAAQBAJ}, + year={2014}, + publisher={IT Revolution Press} +} + +@book{liker2006toyota, + title={Toyota way fieldbook}, + author={Liker, Jeffrey K and Meier, David}, + year={2006}, + publisher={McGraw-Hill Education} +} + +@book{beyer2016site, + title={Site Reliability Engineering: How Google Runs Production Systems}, + author={Beyer, B. and Jones, C. and Petoff, J. and Murphy, N.R.}, + isbn={9781491929124}, + lccn={2017304248}, + url={https://books.google.lk/books?id=81UrjwEACAAJ}, + year={2016}, + publisher={O'Reilly Media, Incorporated} +} + +@misc{Googleoutage:online, +author = {}, +title = {Google Cloud Incident \#20013}, +howpublished = {\url{https://status.cloud.google.com/incident/zall/20013}}, +month = 12, +year = 2020, +note = {(Accessed on 12/03/2021)} +} + +@misc{Artifici8:online, +author = {Ben Linders}, +title = {Artificial Intelligence for IT Operations: an Overview}, +howpublished = {\url{https://www.infoq.com/news/2021/07/AI-IT-operations/}}, +month = {7}, +year = {2021}, +note = {(Accessed on 12/04/2021)} +} + + +% Problem +@misc{CloudAdo16:online, + title = {Cloud Adoption Statistics - It's Everywhere \& Everyone's Using It in 2021!}, + author = {Nick Galov}, + year = 2021, + month = 9, + note = {(Accessed on 08/27/2021)}, + howpublished = {\url{https://hostingtribunal.com/blog/cloud-adoption-statistics/}} +} +% Research Motivation + +% Related Works +@misc{Watchdog76:online, + title = {Watchdog: Auto-Detect Performance Anomalies Without Setting Alerts | Datadog}, + author = {Brad Menezes}, + year = 2018, + month = 7, + note = {(Accessed on 08/29/2021)}, + howpublished = {\url{https://www.datadoghq.com/blog/watchdog/}} +} +@inproceedings{prabodha2017monitoring, + title = {Monitoring Health of Large Scale Software Systems Using Drift Detection Techniques}, + author = {Prabodha, LHC and Vithanage, WRR and Ranaweera, LT and Dissanayake, DMMAIB and Ranathunga, Surangika}, + year = 2017, + booktitle = {Conference on Complex, Intelligent, and Software Intensive Systems}, + pages = {152--163}, + organization = {Springer} +} + +@inproceedings{kim2018encoding, + title = {An encoding technique for CNN-based network anomaly detection}, + author = {Kim, Taejoon and Suh, Sang C and Kim, Hyunjoo and Kim, Jonghyun and Kim, Jinoh}, + year = 2018, + booktitle = {2018 IEEE International Conference on Big Data (Big Data)}, + pages = {2960--2965}, + organization = {IEEE} +} +@inproceedings{dasgupta2002anomaly, + title = {Anomaly detection in multidimensional data using negative selection algorithm}, + author = {Dasgupta, Dipankar and Majumdar, Nivedita Sumi}, + year = 2002, + booktitle = {Proceedings of the 2002 Congress on Evolutionary Computation. CEC'02 (Cat. No. 
02TH8600)}, + volume = 2, + pages = {1039--1044}, + organization = {IEEE} +} + +@article{soldani2021anomaly, + title = {Anomaly Detection and Failure Root Cause Analysis in (Micro) Service-Based Cloud Applications: A Survey}, + author = {Soldani, Jacopo and Brogi, Antonio}, + year = 2021, + journal = {arXiv preprint arXiv:2105.12378} +} +@inproceedings{hagemann2020systematic, + title = {A Systematic Review on Anomaly Detection for Cloud Computing Environments}, + author = {Hagemann, Tanja and Katsarou, Katerina}, + year = 2020, + booktitle = {2020 3rd Artificial Intelligence and Cloud Computing Conference}, + pages = {83--96} +} +@article{hinton2006reducing, + title = {Reducing the dimensionality of data with neural networks}, + author = {Hinton, Geoffrey E and Salakhutdinov, Ruslan R}, + year = 2006, + journal = {science}, + publisher = {American Association for the Advancement of Science}, + volume = 313, + number = 5786, + pages = {504--507} +} +@article{goodfellow2014generative, + title = {Generative adversarial nets}, + author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua}, + year = 2014, + journal = {Advances in neural information processing systems}, + volume = 27 +} +@article{donahue2016adversarial, + title = {Adversarial feature learning}, + author = {Donahue, Jeff and Kr{\"a}henb{\"u}hl, Philipp and Darrell, Trevor}, + year = 2016, + journal = {arXiv preprint arXiv:1605.09782} +} +@article{buckland1994relationship, + title = {The relationship between recall and precision}, + author = {Buckland, Michael and Gey, Fredric}, + year = 1994, + journal = {Journal of the American society for information science}, + publisher = {Wiley Online Library}, + volume = 45, + number = 1, + pages = {12--19} +} +@misc{Accuracy18:online, + title = {Accuracy vs. F1-Score}, + author = {urva Huilgol}, + year = 2019, + month = {08}, + note = {(Accessed on 10/20/2021)}, + howpublished = {\url{https://medium.com/analytics-vidhya/accuracy-vs-f1-score-6258237beca2}} +} +% Approach +@misc{WhatisaS27:online, + title = {What is a System Monitor?}, + author = {G. Wiesen}, + year = {}, + month = {}, + note = {(Accessed on 10/02/2021)}, + howpublished = {\url{https://www.easytechjunkie.com/what-is-a-system-monitor.htm}} +} +@misc{Whatisvi12:online, + title = {What is virtualization?}, + author = {}, + year = {}, + month = {}, + note = {(Accessed on 10/02/2021)}, + howpublished = {\url{https://www.redhat.com/en/topics/virtualization/what-is-virtualization#history-of-virtualization}} +} +@article{Mergen_Uhlig_Krieger_Xenidis_2006, + title = {Virtualization for high-performance computing}, + author = {Mergen, Mark F. and Uhlig, Volkmar and Krieger, Orran and Xenidis, Jimi}, + year = 2006, + month = {Apr}, + journal = {ACM SIGOPS Operating Systems Review}, + volume = 40, + number = 2, + pages = {8–11}, + doi = {10.1145/1131322.1131328}, + issn = {0163-5980}, + abstractnote = {The specific demands of high-performance computing (HPC) often mismatch the assumptions and algorithms provided by legacy operating systems (OS) for common workload mixes. 
While feature- and application-rich OSes allow for flexible and low-cost hardware configurations, rapid development, and flexible testing and debugging, the mismatch comes at the cost of — oftentimes significant — performance degradation for HPC applications.} +} +@misc{7waysweh13:online, + title = {7 ways we harden our KVM hypervisor at Google Cloud: security in plaintext | Google Cloud Blog}, + author = {Andy Honig, Nelly Porter}, + year = 2017, + month = {01}, + note = {(Accessed on 10/02/2021)}, + howpublished = {\url{https://cloud.google.com/blog/products/gcp/7-ways-we-harden-our-kvm-hypervisor-at-google-cloud-security-in-plaintext}} +} +@inproceedings{kivity2007kvm, + title = {kvm: the Linux virtual machine monitor}, + author = {Kivity, Avi and Kamay, Yaniv and Laor, Dor and Lublin, Uri and Liguori, Anthony}, + year = 2007, + booktitle = {Proceedings of the Linux symposium}, + pages = {225--230}, + organization = {Dttawa, Dntorio, Canada} +} +@inproceedings{toka2021predicting, + title = {Predicting cloud-native application failures based on monitoring data of cloud infrastructure}, + author = {Toka, Laszlo and Dobreff, Gergely and Haja, David and Szalay, Mark}, + year = 2021, + booktitle = {2021 IFIP/IEEE International Symposium on Integrated Network Management (IM)}, + pages = {842--847}, + organization = {IEEE} +} +@inproceedings{li2019service, + title = {Service mesh: Challenges, state of the art, and future research opportunities}, + author = {Li, Wubin and Lemieux, Yves and Gao, Jing and Zhao, Zhuofeng and Han, Yanbo}, + year = 2019, + booktitle = {2019 IEEE International Conference on Service-Oriented System Engineering (SOSE)}, + pages = {122--1225}, + organization = {IEEE} +} +@misc{Benchmar93:online, + title = {Benchmarking Linkerd and Istio}, + author = {William Morgan}, + year = 2021, + month = {05}, + note = {(Accessed on 10/02/2021)}, + howpublished = {\url{https://linkerd.io/2021/05/27/linkerd-vs-istio-benchmarks/}} +} +@misc{Whatissi48:online, + title = {What is sidecar proxy?}, + author = {Alexander S. 
Gillis}, + year = 2019, + month = {01}, + note = {(Accessed on 10/02/2021)}, + howpublished = {\url{https://searchitoperations.techtarget.com/definition/sidecar-proxy}} +} +@misc{LKMLIngo52:online, + title = {LKML: Ingo Molnar: [GIT PULL] perf updates for v4.1}, + author = {Ingo Molnar}, + year = 2015, + month = 4, + note = {(Accessed on 10/02/2021)}, + howpublished = {\url{https://lkml.org/lkml/2015/4/14/232}} +} +@misc{WhatiseB46:online, + title = {What is eBPF?}, + author = {}, + year = {}, + month = {}, + note = {(Accessed on 10/07/2021)}, + howpublished = {\url{https://ebpf.io/what-is-ebpf}} +} +@article{batista2004study, + title = {A study of the behavior of several methods for balancing machine learning training data}, + author = {Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina}, + year = 2004, + journal = {ACM SIGKDD explorations newsletter}, + publisher = {ACM New York, NY, USA}, + volume = 6, + number = 1, + pages = {20--29} +} +@inproceedings{akcay2018ganomaly, + title = {Ganomaly: Semi-supervised anomaly detection via adversarial training}, + author = {Akcay, Samet and Atapour-Abarghouei, Amir and Breckon, Toby P}, + year = 2018, + booktitle = {Asian conference on computer vision}, + pages = {622--637}, + organization = {Springer} +} +@inproceedings{geethika2019anomaly, + title = {Anomaly Detection in High-Performance API Gateways}, + author = {Geethika, Deshani and Jayasinghe, Malith and Gunarathne, Yasas and Gamage, Thilina Ashen and Jayathilaka, Sudaraka and Ranathunga, Surangika and Perera, Srinath}, + year = 2019, + booktitle = {2019 International Conference on High Performance Computing \& Simulation (HPCS)}, + pages = {995--1001}, + organization = {IEEE} +} +@article{khoshnevisan2019rsm, + title = {Rsm-gan: A convolutional recurrent gan for anomaly detection in contaminated seasonal multivariate time series}, + author = {Khoshnevisan, Farzaneh and Fan, Zhewen}, + year = 2019, + journal = {arXiv preprint arXiv:1911.07104} +} +@inproceedings{nguyen2013fchain, + title = {Fchain: Toward black-box online fault localization for cloud systems}, + author = {Nguyen, Hiep and Shen, Zhiming and Tan, Yongmin and Gu, Xiaohui}, + year = 2013, + booktitle = {2013 IEEE 33rd International Conference on Distributed Computing Systems}, + pages = {21--30}, + organization = {IEEE} +} +@inproceedings{nguyen2011pal, +author = {Nguyen, Hiep and Tan, Yongmin and Gu, Xiaohui}, +title = {PAL: PRopagation-Aware ANomaly LOcalization for Cloud Hosted Distributed Applications}, +year = {2011}, +isbn = {9781450309783}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/2038633.2038634}, +doi = {10.1145/2038633.2038634}, +booktitle = {Managing Large-Scale Systems via the Analysis of System Logs and the Application of Machine Learning Techniques}, +articleno = {1}, +numpages = {8}, +keywords = {cloud computing, anomaly propagation, fault localization}, +location = {Cascais, Portugal}, +series = {SLAML '11} +} +@inproceedings{wang2020root, + title = {Root-cause metric location for microservice systems via log anomaly detection}, + author = {Wang, Lingzhi and Zhao, Nengwen and Chen, Junjie and Li, Pinnong and Zhang, Wenchi and Sui, Kaixin}, + year = 2020, + booktitle = {2020 IEEE International Conference on Web Services (ICWS)}, + pages = {142--150}, + organization = {IEEE} +} +@inproceedings{ma2020automap, + title = {Automap: Diagnose your microservice-based web applications automatically}, + author = {Ma, Meng and Xu, 
Jingmin and Wang, Yuan and Chen, Pengfei and Zhang, Zonghua and Wang, Ping}, + year = 2020, + booktitle = {Proceedings of The Web Conference 2020}, + pages = {246--258} +} +@inproceedings{meng2020localizing, + title = {Localizing failure root causes in a microservice through causality inference}, + author = {Meng, Yuan and Zhang, Shenglin and Sun, Yongqian and Zhang, Ruru and Hu, Zhilong and Zhang, Yiyin and Jia, Chenyang and Wang, Zhaogang and Pei, Dan}, + year = 2020, + booktitle = {2020 IEEE/ACM 28th International Symposium on Quality of Service (IWQoS)}, + pages = {1--10}, + organization = {IEEE} +} +% Research Gap + +% Research Challenges +@misc{Unsuperv29:online, + title = {Unsupervised Learning and Data Clustering}, + author = {Sanatan Mishra}, + year = 2017, + month = {05}, + note = {(Accessed on 10/08/2021)}, + howpublished = {\url{https://towardsdatascience.com/unsupervised-learning-and-data-clustering-eeecb78b422a}} +} +@article{silver2016mastering, + title = {Mastering the game of Go with deep neural networks and tree search}, + author = {Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others}, + year = 2016, + journal = {nature}, + publisher = {Nature Publishing Group}, + volume = 529, + number = 7587, + pages = {484--489} +} +% Research methods + + +% Sort These +@inproceedings{chaczko2011availability, + title = {Availability and load balancing in cloud computing}, + author = {Chaczko, Zenon and Mahadevan, Venkatesh and Aslanzadeh, Shahrzad and Mcdermid, Christopher}, + year = 2011, + booktitle = {International Conference on Computer and Software Modeling, Singapore}, + volume = 14, + pages = {134--140}, + organization = {IACSIT Press} +} +@inproceedings{dua2014virtualization, + title = {Virtualization vs containerization to support paas}, + author = {Dua, Rajdeep and Raja, A Reddy and Kakadia, Dharmesh}, + year = 2014, + booktitle = {2014 IEEE International Conference on Cloud Engineering}, + pages = {610--614}, + organization = {IEEE} +} +@misc{Dockervs91:online, + title = {Docker vs Virtual Machines (VMs) : A Practical Guide to Docker Containers and VMs}, + author = {WeaveWorks}, + year = 2020, + month = {01}, + note = {(Accessed on 11/20/2021)}, + howpublished = {\url{https://www.weave.works/blog/a-practical-guide-to-choosing-between-docker-containers-and-vms}} +} +@misc{Kubernet59:online, + title = {Kubernetes adoption, security, and market trends report 2021}, + author = {RedHat}, + year = 2021, + month = {07}, + note = {(Accessed on 11/20/2021)}, + howpublished = {\url{https://www.redhat.com/en/resources/kubernetes-adoption-security-market-trends-2021-overview}} +} +@article{ElasticityCloudComputing, + title = {Elasticity in Cloud Computing: State of the Art and Research Challenges}, + author = {al-dhuraibi, Yahya and Fawaz, Paraiso and Djarallah, Nabil and Merle, Philippe}, + year = 2017, + month = {06}, + journal = {IEEE Transactions on Services Computing}, + volume = {PP}, + pages = {1--1}, + doi = {10.1109/TSC.2017.2711009} +} + + + + + + + +%% SRS + + +@misc{CloudNat36:online, +author = {CNCF}, +title = {Cloud Native Survey 2020: Containers in production jump 300\% from our first survey}, +howpublished = {\url{https://www.cncf.io/blog/2020/11/17/cloud-native-survey-2020-containers-in-production-jump-300-from-our-first-survey/}}, +month = {11}, +year = {2020}, +note = {(Accessed on 02/02/2022)} +} + 
+@misc{Introduc93:online, +author = {}, +title = {The Kubebuilder Book}, +howpublished = {\url{https://book.kubebuilder.io}}, +month = {}, +year = {}, +note = {(Accessed on 02/03/2022)} +} + + +@misc{Microsof81:online, +author = {Microsoft}, +title = {Microsoft 365 Status on Twitter: "We're investigating an issue for access to multiple M365 services. Please visit the admin center post M0244568 for more information. We'll provide additional information here as it becomes available."}, +howpublished = {\url{https://twitter.com/MSFT365Status/status/1371546946263916545}}, +month = {03}, +year = {2021}, +note = {(Accessed on 03/02/2022)} +} + + + + +% Implementation + +@misc{WhatisaT74:online, +author = {AfterAcademy}, +title = {What is a TCP 3-way handshake process?}, +howpublished = {\url{https://afteracademy.com/blog/what-is-a-tcp-3-way-handshake-process}}, +month = {02}, +year = {2020}, +note = {(Accessed on 03/02/2022)} +} + +@article{sola1997importance, + title={Importance of input data normalization for the application of neural networks to complex industrial problems}, + author={Sola, Jorge and Sevilla, Joaquin}, + journal={IEEE Transactions on nuclear science}, + volume={44}, + number={3}, + pages={1464--1468}, + year={1997}, + publisher={IEEE} +} +@book{hausenblas2019programming, + title={Programming Kubernetes: Developing Cloud-Native Applications}, + author={Hausenblas, M. and Schimanski, S.}, + isbn={9781492047070}, + url={https://books.google.lk/books?id=7VKjDwAAQBAJ}, + year={2019}, + publisher={O'Reilly Media} +} diff --git a/proposal/acronym.tex b/documentation/proposal/acronym.tex similarity index 100% rename from proposal/acronym.tex rename to documentation/proposal/acronym.tex diff --git a/documentation/proposal/assets/High-level-system-diagram.png b/documentation/proposal/assets/High-level-system-diagram.png new file mode 100644 index 0000000..553993d Binary files /dev/null and b/documentation/proposal/assets/High-level-system-diagram.png differ diff --git a/documentation/proposal/assets/IIT-Logo.png b/documentation/proposal/assets/IIT-Logo.png new file mode 100644 index 0000000..4a12ee1 Binary files /dev/null and b/documentation/proposal/assets/IIT-Logo.png differ diff --git a/documentation/proposal/assets/gantt-chart.jpg b/documentation/proposal/assets/gantt-chart.jpg new file mode 100644 index 0000000..3d627ff Binary files /dev/null and b/documentation/proposal/assets/gantt-chart.jpg differ diff --git a/proposal/cover-page.tex b/documentation/proposal/cover-page.tex similarity index 100% rename from proposal/cover-page.tex rename to documentation/proposal/cover-page.tex diff --git a/proposal/main.tex b/documentation/proposal/main.tex similarity index 100% rename from proposal/main.tex rename to documentation/proposal/main.tex diff --git a/proposal/preamble.tex b/documentation/proposal/preamble.tex similarity index 100% rename from proposal/preamble.tex rename to documentation/proposal/preamble.tex diff --git a/proposal/project-proposal.pdf b/documentation/proposal/project-proposal.pdf similarity index 100% rename from proposal/project-proposal.pdf rename to documentation/proposal/project-proposal.pdf diff --git a/proposal/references.bib b/documentation/proposal/references.bib similarity index 100% rename from proposal/references.bib rename to documentation/proposal/references.bib diff --git a/proposal/sections/appendix.tex b/documentation/proposal/sections/appendix.tex similarity index 100% rename from proposal/sections/appendix.tex rename to 
documentation/proposal/sections/appendix.tex diff --git a/proposal/sections/background.tex b/documentation/proposal/sections/background.tex similarity index 100% rename from proposal/sections/background.tex rename to documentation/proposal/sections/background.tex diff --git a/proposal/sections/introduction.tex b/documentation/proposal/sections/introduction.tex similarity index 100% rename from proposal/sections/introduction.tex rename to documentation/proposal/sections/introduction.tex diff --git a/proposal/sections/methodology/development.tex b/documentation/proposal/sections/methodology/development.tex similarity index 100% rename from proposal/sections/methodology/development.tex rename to documentation/proposal/sections/methodology/development.tex diff --git a/proposal/sections/methodology/project-management.tex b/documentation/proposal/sections/methodology/project-management.tex similarity index 100% rename from proposal/sections/methodology/project-management.tex rename to documentation/proposal/sections/methodology/project-management.tex diff --git a/proposal/sections/methodology/research.tex b/documentation/proposal/sections/methodology/research.tex similarity index 100% rename from proposal/sections/methodology/research.tex rename to documentation/proposal/sections/methodology/research.tex diff --git a/proposal/sections/problem.tex b/documentation/proposal/sections/problem.tex similarity index 100% rename from proposal/sections/problem.tex rename to documentation/proposal/sections/problem.tex diff --git a/proposal/sections/project-scope.tex b/documentation/proposal/sections/project-scope.tex similarity index 100% rename from proposal/sections/project-scope.tex rename to documentation/proposal/sections/project-scope.tex diff --git a/proposal/sections/related-work.tex b/documentation/proposal/sections/related-work.tex similarity index 100% rename from proposal/sections/related-work.tex rename to documentation/proposal/sections/related-work.tex diff --git a/proposal/sections/research/aim.tex b/documentation/proposal/sections/research/aim.tex similarity index 100% rename from proposal/sections/research/aim.tex rename to documentation/proposal/sections/research/aim.tex diff --git a/proposal/sections/research/challenge.tex b/documentation/proposal/sections/research/challenge.tex similarity index 100% rename from proposal/sections/research/challenge.tex rename to documentation/proposal/sections/research/challenge.tex diff --git a/proposal/sections/research/contribution.tex b/documentation/proposal/sections/research/contribution.tex similarity index 100% rename from proposal/sections/research/contribution.tex rename to documentation/proposal/sections/research/contribution.tex diff --git a/proposal/sections/research/gap.tex b/documentation/proposal/sections/research/gap.tex similarity index 100% rename from proposal/sections/research/gap.tex rename to documentation/proposal/sections/research/gap.tex diff --git a/proposal/sections/research/motivation.tex b/documentation/proposal/sections/research/motivation.tex similarity index 100% rename from proposal/sections/research/motivation.tex rename to documentation/proposal/sections/research/motivation.tex diff --git a/proposal/sections/research/objective.tex b/documentation/proposal/sections/research/objective.tex similarity index 100% rename from proposal/sections/research/objective.tex rename to documentation/proposal/sections/research/objective.tex diff --git a/proposal/sections/research/question.tex 
b/documentation/proposal/sections/research/question.tex similarity index 100% rename from proposal/sections/research/question.tex rename to documentation/proposal/sections/research/question.tex diff --git a/proposal/sections/research/sections/appendix.tex b/documentation/proposal/sections/research/sections/appendix.tex similarity index 100% rename from proposal/sections/research/sections/appendix.tex rename to documentation/proposal/sections/research/sections/appendix.tex diff --git a/proposal/sections/research/sections/background.tex b/documentation/proposal/sections/research/sections/background.tex similarity index 100% rename from proposal/sections/research/sections/background.tex rename to documentation/proposal/sections/research/sections/background.tex diff --git a/proposal/sections/research/sections/introduction.tex b/documentation/proposal/sections/research/sections/introduction.tex similarity index 100% rename from proposal/sections/research/sections/introduction.tex rename to documentation/proposal/sections/research/sections/introduction.tex diff --git a/proposal/sections/research/sections/methodology/development.tex b/documentation/proposal/sections/research/sections/methodology/development.tex similarity index 100% rename from proposal/sections/research/sections/methodology/development.tex rename to documentation/proposal/sections/research/sections/methodology/development.tex diff --git a/proposal/sections/research/sections/methodology/project-management.tex b/documentation/proposal/sections/research/sections/methodology/project-management.tex similarity index 100% rename from proposal/sections/research/sections/methodology/project-management.tex rename to documentation/proposal/sections/research/sections/methodology/project-management.tex diff --git a/proposal/sections/research/sections/methodology/research.tex b/documentation/proposal/sections/research/sections/methodology/research.tex similarity index 100% rename from proposal/sections/research/sections/methodology/research.tex rename to documentation/proposal/sections/research/sections/methodology/research.tex diff --git a/proposal/sections/research/sections/problem.tex b/documentation/proposal/sections/research/sections/problem.tex similarity index 100% rename from proposal/sections/research/sections/problem.tex rename to documentation/proposal/sections/research/sections/problem.tex diff --git a/proposal/sections/research/sections/project-scope.tex b/documentation/proposal/sections/research/sections/project-scope.tex similarity index 100% rename from proposal/sections/research/sections/project-scope.tex rename to documentation/proposal/sections/research/sections/project-scope.tex diff --git a/proposal/sections/research/sections/related-work.tex b/documentation/proposal/sections/research/sections/related-work.tex similarity index 100% rename from proposal/sections/research/sections/related-work.tex rename to documentation/proposal/sections/research/sections/related-work.tex diff --git a/proposal/sections/research/sections/research/aim.tex b/documentation/proposal/sections/research/sections/research/aim.tex similarity index 100% rename from proposal/sections/research/sections/research/aim.tex rename to documentation/proposal/sections/research/sections/research/aim.tex diff --git a/proposal/sections/research/sections/research/challenge.tex b/documentation/proposal/sections/research/sections/research/challenge.tex similarity index 100% rename from proposal/sections/research/sections/research/challenge.tex rename to 
documentation/proposal/sections/research/sections/research/challenge.tex diff --git a/proposal/sections/research/sections/research/contribution.tex b/documentation/proposal/sections/research/sections/research/contribution.tex similarity index 100% rename from proposal/sections/research/sections/research/contribution.tex rename to documentation/proposal/sections/research/sections/research/contribution.tex diff --git a/proposal/sections/research/sections/research/gap.tex b/documentation/proposal/sections/research/sections/research/gap.tex similarity index 100% rename from proposal/sections/research/sections/research/gap.tex rename to documentation/proposal/sections/research/sections/research/gap.tex diff --git a/proposal/sections/research/sections/research/motivation.tex b/documentation/proposal/sections/research/sections/research/motivation.tex similarity index 100% rename from proposal/sections/research/sections/research/motivation.tex rename to documentation/proposal/sections/research/sections/research/motivation.tex diff --git a/proposal/sections/research/sections/research/objective.tex b/documentation/proposal/sections/research/sections/research/objective.tex similarity index 100% rename from proposal/sections/research/sections/research/objective.tex rename to documentation/proposal/sections/research/sections/research/objective.tex diff --git a/proposal/sections/research/sections/research/question.tex b/documentation/proposal/sections/research/sections/research/question.tex similarity index 100% rename from proposal/sections/research/sections/research/question.tex rename to documentation/proposal/sections/research/sections/research/question.tex diff --git a/gazer/sock_state.c b/gazer/sock_state.c index 2633d6e..25542e9 100644 --- a/gazer/sock_state.c +++ b/gazer/sock_state.c @@ -1,3 +1,4 @@ +// Adopted from https://github.com/iovisor/bcc/blob/master/tools/tcplife.py TRACEPOINT_PROBE(sock, inet_sock_set_state) { if (args->protocol != IPPROTO_TCP) diff --git a/sherlock/scraper.py b/sherlock/scraper.py index ab4f917..f1fe61c 100644 --- a/sherlock/scraper.py +++ b/sherlock/scraper.py @@ -11,7 +11,7 @@ {'name': 'cpu_usage', "query": 'avg_over_time(cpu_seconds[1m])'}, {'name': 'memory_usage', "query": 'avg_over_time(memory_usage_bytes[1m])'}, {'name': 'acknowledged_bytes_per_minute', "query": 'rate(acknowledged_bytes_sum[1m])'}, - {'name': 'acknowledged_bytes_per_minute', "query": 'rate(transmitted_bytes_sum[1m])'}, + {'name': 'transmitted_bytes_per_minute', "query": 'rate(transmitted_bytes_sum[1m])'}, {'name': 'syn_backlog_per_minute', "query": 'avg_over_time(backlog{level="1"}[1m])'}, {'name': 'high_syn_backlog_per_minute', "query": 'sum by (serviceName) (avg_over_time(backlog{level!="1"}[1m]))'}, ] @@ -19,14 +19,14 @@ for query in tqdm(queries): - start_time = parse_datetime("1d") - end_time = parse_datetime("now") + start_time = parse_datetime("41h") + end_time = parse_datetime("20h") metric_data = prom.custom_query_range( query['query'], # this is the metric name and label config start_time=start_time, end_time=end_time, - step="14", + step="15", ) metric_df = MetricRangeDataFrame(metric_data, columns=['timestamp', 'serviceName', 'value'])
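For context on the sherlock/scraper.py hunk above: the change fixes a copy-paste error (two queries sharing the name acknowledged_bytes_per_minute), moves the scrape window from the last day to a fixed range between 41 and 20 hours ago, and changes the step from 14 to 15 seconds. A minimal, self-contained sketch of how such a query list is pulled into per-metric DataFrames with prometheus-api-client is shown below; the Prometheus URL and the CSV output are assumptions, not part of the actual script.

# Sketch of pulling the corrected query list into per-metric DataFrames.
# The Prometheus URL and the CSV output are illustrative assumptions.
from prometheus_api_client import PrometheusConnect, MetricRangeDataFrame
from prometheus_api_client.utils import parse_datetime
from tqdm import tqdm

prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)

queries = [
    {"name": "acknowledged_bytes_per_minute", "query": "rate(acknowledged_bytes_sum[1m])"},
    {"name": "transmitted_bytes_per_minute", "query": "rate(transmitted_bytes_sum[1m])"},
]

for query in tqdm(queries):
    metric_data = prom.custom_query_range(
        query["query"],
        start_time=parse_datetime("41h"),  # 41 hours ago
        end_time=parse_datetime("20h"),    # 20 hours ago -> a fixed 21-hour window
        step="15",                         # one sample every 15 seconds
    )
    metric_df = MetricRangeDataFrame(metric_data, columns=["timestamp", "serviceName", "value"])
    metric_df.to_csv(f"{query['name']}.csv")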