diff --git a/.gitignore b/.gitignore index 71e109b..8a425f4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,9 +3,6 @@ # Project Proposal -proposal/* -!proposal/*.tex -!proposal/*.bib !proposal/project-proposal.pdf diff --git a/proposal/.gitignore b/proposal/.gitignore new file mode 100644 index 0000000..48bfdcd --- /dev/null +++ b/proposal/.gitignore @@ -0,0 +1,302 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/tex +# Edit at https://www.toptal.com/developers/gitignore?templates=tex + +### TeX ### +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Intermediate documents: +*.dvi +*.xdv +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Generated if empty string is given at "Please type another file name for output:" +*.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Build tool directories for auxiliary files +# latexrun +latex.out/ + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.pre +*.snm +*.vrb + +# changes +*.soc + +# comment +*.cut + +# cprotect +*.cpt + +# elsarticle (documentclass of Elsevier journals) +*.spl + +# endnotes +*.ent + +# fixme +*.lox + +# feynmf/feynmp +*.mf +*.mp +*.t[1-9] +*.t[1-9][0-9] +*.tfm + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs +*.lzo +*.lzs + +# uncomment this for glossaries-extra (will ignore makeindex's style 
files!) +# *.ist + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.glog +*.gtex + +# htlatex +*.4ct +*.4tc +*.idv +*.lg +*.trc +*.xref + +# hyperref +*.brf + +# knitr +*-concordance.tex +# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files +# *.tikz +*-tikzDictionary + +# listings +*.lol + +# luatexja-ruby +*.ltjruby + +# makeidx +*.idx +*.ilg +*.ind + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc[0-9]* +*.slf[0-9]* +*.slt[0-9]* +*.stc[0-9]* + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# newpax +*.newpax + +# nomencl +*.nlg +*.nlo +*.nls + +# pax +*.pax + +# pdfpcnotes +*.pdfpc + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# tcolorbox +*.listing + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# todonotes +*.tdo + +# vhistory +*.hst +*.ver + +# easy-todo +*.lod + +# xcolor +*.xcp + +# xmpincl +*.xmpi + +# xindy +*.xdy + +# xypic precompiled matrices and outlines +*.xyc +*.xyd + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# LyX +*.lyx~ + +# Kile +*.backup + +# gummi +.*.swp + +# KBibTeX +*~[0-9]* + +# TeXnicCenter +*.tps + +# auto folder when using emacs and auctex +./auto/* +*.el + +# expex forward references with \gathertags +*-tags.tex + +# standalone packages +*.sta + +# Makeindex log files +*.lpz + +# xwatermark package +*.xwm + +# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib +# option is specified. Footnotes are the stored in a file with suffix Notes.bib. +# Uncomment the next line to have this generated file ignored. 
+#*Notes.bib + +### TeX Patch ### +# LIPIcs / OASIcs +*.vtc + +# glossaries +*.glstex + +# End of https://www.toptal.com/developers/gitignore/api/tex \ No newline at end of file diff --git a/proposal/acronym.tex b/proposal/acronym.tex index 962467e..d69fb82 100644 --- a/proposal/acronym.tex +++ b/proposal/acronym.tex @@ -2,7 +2,6 @@ \chapter*{List of Acronyms} \begin{acronym} \acro{iaas}[IaaS]{Infrastructure as a Service} -% \acro{saas}[SaaS]{Software as a service} \acro{sres}[SRE]{Site Reliability Engineer} \acro{sli}[SLI]{Service Level Indicator} \acro{apm}[APM]{Application Performance Monitoring} @@ -12,7 +11,5 @@ \chapter*{List of Acronyms} \acro{fsl}[FSL]{Few-shot Learning} \acro{sdlc}[SDLC]{Software Development Life Cycle} \acro{ooad}[OOAD]{Object-oriented analysis and design} -% \acro{vm}[VM]{Virtual Machine} -% \acro{cncf}[CNCF]{Cloud Native Computing Foundation} \acro{ebpf}[eBPF]{Extended Berkeley Packet Filter} \end{acronym} \ No newline at end of file diff --git a/proposal/assets/High-level-system-diagram.png b/proposal/assets/High-level-system-diagram.png new file mode 100644 index 0000000..553993d Binary files /dev/null and b/proposal/assets/High-level-system-diagram.png differ diff --git a/proposal/assets/IIT-Logo.png b/proposal/assets/IIT-Logo.png new file mode 100644 index 0000000..4a12ee1 Binary files /dev/null and b/proposal/assets/IIT-Logo.png differ diff --git a/proposal/assets/gantt-chart.jpg b/proposal/assets/gantt-chart.jpg new file mode 100644 index 0000000..3d627ff Binary files /dev/null and b/proposal/assets/gantt-chart.jpg differ diff --git a/proposal/preamble.tex b/proposal/preamble.tex index 45d73fd..11b8d3b 100644 --- a/proposal/preamble.tex +++ b/proposal/preamble.tex @@ -85,4 +85,4 @@ % \titlespacing*{\subsection}{0pt}{15pt}{10pt} \titleformat{\subsubsection}[hang]{\fontsize{12pt}{0}\itshape}{\thesubsubsection}{1em}{} -% \titlespacing*{\subsubsection}{0pt}{15pt}{10pt} +% \titlespacing*{\subsubsection}{0pt}{15pt}{10pt} \ No newline at 
end of file diff --git a/proposal/references.bib b/proposal/references.bib index f334d57..1a95782 100644 --- a/proposal/references.bib +++ b/proposal/references.bib @@ -1,20 +1,4 @@ % Introduction -@article{dragoni2017microservices, - title = {Microservices: yesterday, today, and tomorrow}, - author = {Dragoni, Nicola and Giallorenzo, Saverio and Lafuente, Alberto Lluch and Mazzara, Manuel and Montesi, Fabrizio and Mustafin, Ruslan and Safina, Larisa}, - year = 2017, - journal = {Present and ulterior software engineering}, - publisher = {Springer}, - pages = {195--216} -} -@misc{Introduc54:online, - title = {Introducing Domain-Oriented Microservice Architecture | Uber Engineering Blog}, - author = {Adam Gluck}, - year = 2020, - month = 7, - note = {(Accessed on 08/26/2021)}, - howpublished = {\url{https://eng.uber.com/microservice-architecture/}} -} % Background @misc{rimol_2021, title = {Gartner Says Worldwide IaaS Public Cloud Services Market Grew 40.7\% in 2020}, @@ -73,22 +57,7 @@ @misc{5WaysYou35:online howpublished = {\url{https://www.overops.com/blog/5-ways-to-not-f-up-your-microservices-in-production/}} } % Problem -@misc{CloudAdo16:online, - title = {Cloud Adoption Statistics - It's Everywhere \& Everyone's Using It in 2021!}, - author = {Nick Galov}, - year = 2021, - month = 9, - note = {(Accessed on 08/27/2021)}, - howpublished = {\url{https://hostingtribunal.com/blog/cloud-adoption-statistics/}} -} -@misc{Datadog18:online, - title = {Datadog Product Tour}, - author = {}, - year = 2020, - month = 9, - note = {(Accessed on 08/26/2021)}, - howpublished = {\url{https://www.youtube.com/watch?v=YmJcbAI\_OCg}} -} + % Research Motivation @misc{Untangli35:online, title = {Untangling Microservices or Balancing Complexity in Distributed Systems}, @@ -115,14 +84,6 @@ @article{silver2017mastering pages = {354--359} } % Related Works -@misc{Watchdog76:online, - title = {Watchdog: Auto-Detect Performance Anomalies Without Setting Alerts | Datadog}, - author = {Brad 
Menezes}, - year = 2018, - month = 7, - note = {(Accessed on 08/29/2021)}, - howpublished = {\url{https://www.datadoghq.com/blog/watchdog/}} -} @inproceedings{du2018anomaly, title = {Anomaly detection and diagnosis for container-based microservices with performance monitoring}, author = {Du, Qingfeng and Xie, Tiandi and He, Yu}, @@ -152,14 +113,6 @@ @article{oord2016wavenet year = 2016, journal = {arXiv preprint arXiv:1609.03499} } -@inproceedings{kim2018encoding, - title = {An encoding technique for CNN-based network anomaly detection}, - author = {Kim, Taejoon and Suh, Sang C and Kim, Hyunjoo and Kim, Jonghyun and Kim, Jinoh}, - year = 2018, - booktitle = {2018 IEEE International Conference on Big Data (Big Data)}, - pages = {2960--2965}, - organization = {IEEE} -} @inproceedings{chigurupati2017root, title = {Root cause analysis using artificial intelligence}, author = {Chigurupati, Asha and Lassar, Noah}, @@ -212,226 +165,9 @@ @inproceedings{zhang2019deep volume = 33, pages = {1409--1416} } -@article{soldani2021anomaly, - title = {Anomaly Detection and Failure Root Cause Analysis in (Micro) Service-Based Cloud Applications: A Survey}, - author = {Soldani, Jacopo and Brogi, Antonio}, - year = 2021, - journal = {arXiv preprint arXiv:2105.12378} -} -@inproceedings{hagemann2020systematic, - title = {A Systematic Review on Anomaly Detection for Cloud Computing Environments}, - author = {Hagemann, Tanja and Katsarou, Katerina}, - year = 2020, - booktitle = {2020 3rd Artificial Intelligence and Cloud Computing Conference}, - pages = {83--96} -} -@article{hinton2006reducing, - title = {Reducing the dimensionality of data with neural networks}, - author = {Hinton, Geoffrey E and Salakhutdinov, Ruslan R}, - year = 2006, - journal = {science}, - publisher = {American Association for the Advancement of Science}, - volume = 313, - number = 5786, - pages = {504--507} -} -@article{goodfellow2014generative, - title = {Generative adversarial nets}, - author = {Goodfellow, Ian 
and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua}, - year = 2014, - journal = {Advances in neural information processing systems}, - volume = 27 -} -@article{donahue2016adversarial, - title = {Adversarial feature learning}, - author = {Donahue, Jeff and Kr{\"a}henb{\"u}hl, Philipp and Darrell, Trevor}, - year = 2016, - journal = {arXiv preprint arXiv:1605.09782} -} -@article{buckland1994relationship, - title = {The relationship between recall and precision}, - author = {Buckland, Michael and Gey, Fredric}, - year = 1994, - journal = {Journal of the American society for information science}, - publisher = {Wiley Online Library}, - volume = 45, - number = 1, - pages = {12--19} -} -@misc{Accuracy18:online, - title = {Accuracy vs. F1-Score}, - author = {urva Huilgol}, - year = 2019, - month = {08}, - note = {(Accessed on 10/20/2021)}, - howpublished = {\url{https://medium.com/analytics-vidhya/accuracy-vs-f1-score-6258237beca2}} -} % Approach -@misc{WhatisaS27:online, - title = {What is a System Monitor?}, - author = {G. Wiesen}, - year = {}, - month = {}, - note = {(Accessed on 10/02/2021)}, - howpublished = {\url{https://www.easytechjunkie.com/what-is-a-system-monitor.htm}} -} -@misc{Whatisvi12:online, - title = {What is virtualization?}, - author = {}, - year = {}, - month = {}, - note = {(Accessed on 10/02/2021)}, - howpublished = {\url{https://www.redhat.com/en/topics/virtualization/what-is-virtualization#history-of-virtualization}} -} -@article{Mergen_Uhlig_Krieger_Xenidis_2006, - title = {Virtualization for high-performance computing}, - author = {Mergen, Mark F. 
and Uhlig, Volkmar and Krieger, Orran and Xenidis, Jimi}, - year = 2006, - month = {Apr}, - journal = {ACM SIGOPS Operating Systems Review}, - volume = 40, - number = 2, - pages = {8–11}, - doi = {10.1145/1131322.1131328}, - issn = {0163-5980}, - abstractnote = {The specific demands of high-performance computing (HPC) often mismatch the assumptions and algorithms provided by legacy operating systems (OS) for common workload mixes. While feature- and application-rich OSes allow for flexible and low-cost hardware configurations, rapid development, and flexible testing and debugging, the mismatch comes at the cost of — oftentimes significant — performance degradation for HPC applications.} -} -@misc{7waysweh13:online, - title = {7 ways we harden our KVM hypervisor at Google Cloud: security in plaintext | Google Cloud Blog}, - author = {Andy Honig, Nelly Porter}, - year = 2017, - month = {01}, - note = {(Accessed on 10/02/2021)}, - howpublished = {\url{https://cloud.google.com/blog/products/gcp/7-ways-we-harden-our-kvm-hypervisor-at-google-cloud-security-in-plaintext}} -} -@inproceedings{kivity2007kvm, - title = {kvm: the Linux virtual machine monitor}, - author = {Kivity, Avi and Kamay, Yaniv and Laor, Dor and Lublin, Uri and Liguori, Anthony}, - year = 2007, - booktitle = {Proceedings of the Linux symposium}, - volume = 1, - number = 8, - pages = {225--230}, - organization = {Dttawa, Dntorio, Canada} -} -@inproceedings{toka2021predicting, - title = {Predicting cloud-native application failures based on monitoring data of cloud infrastructure}, - author = {Toka, Laszlo and Dobreff, Gergely and Haja, David and Szalay, Mark}, - year = 2021, - booktitle = {2021 IFIP/IEEE International Symposium on Integrated Network Management (IM)}, - pages = {842--847}, - organization = {IEEE} -} -@inproceedings{li2019service, - title = {Service mesh: Challenges, state of the art, and future research opportunities}, - author = {Li, Wubin and Lemieux, Yves and Gao, Jing and Zhao, 
Zhuofeng and Han, Yanbo}, - year = 2019, - booktitle = {2019 IEEE International Conference on Service-Oriented System Engineering (SOSE)}, - pages = {122--1225}, - organization = {IEEE} -} -@misc{Benchmar93:online, - title = {Benchmarking Linkerd and Istio}, - author = {William Morgan}, - year = 2021, - month = {05}, - note = {(Accessed on 10/02/2021)}, - howpublished = {\url{https://linkerd.io/2021/05/27/linkerd-vs-istio-benchmarks/}} -} -@misc{Whatissi48:online, - title = {What is sidecar proxy?}, - author = {Alexander S. Gillis}, - year = 2019, - month = {01}, - note = {(Accessed on 10/02/2021)}, - howpublished = {\url{https://searchitoperations.techtarget.com/definition/sidecar-proxy}} -} -@misc{LKMLIngo52:online, - title = {LKML: Ingo Molnar: [GIT PULL] perf updates for v4.1}, - author = {Ingo Molnar}, - year = 2015, - month = 4, - note = {(Accessed on 10/02/2021)}, - howpublished = {\url{https://lkml.org/lkml/2015/4/14/232}} -} -@misc{WhatiseB46:online, - title = {What is eBPF?}, - author = {}, - year = {}, - month = {}, - note = {(Accessed on 10/07/2021)}, - howpublished = {\url{https://ebpf.io/what-is-ebpf}} -} -@article{batista2004study, - title = {A study of the behavior of several methods for balancing machine learning training data}, - author = {Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina}, - year = 2004, - journal = {ACM SIGKDD explorations newsletter}, - publisher = {ACM New York, NY, USA}, - volume = 6, - number = 1, - pages = {20--29} -} -@inproceedings{akcay2018ganomaly, - title = {Ganomaly: Semi-supervised anomaly detection via adversarial training}, - author = {Akcay, Samet and Atapour-Abarghouei, Amir and Breckon, Toby P}, - year = 2018, - booktitle = {Asian conference on computer vision}, - pages = {622--637}, - organization = {Springer} -} -@inproceedings{geethika2019anomaly, - title = {Anomaly Detection in High-Performance API Gateways}, - author = {Geethika, Deshani and Jayasinghe, Malith and Gunarathne, Yasas and 
Gamage, Thilina Ashen and Jayathilaka, Sudaraka and Ranathunga, Surangika and Perera, Srinath}, - year = 2019, - booktitle = {2019 International Conference on High Performance Computing \& Simulation (HPCS)}, - pages = {995--1001}, - organization = {IEEE} -} -@article{khoshnevisan2019rsm, - title = {Rsm-gan: A convolutional recurrent gan for anomaly detection in contaminated seasonal multivariate time series}, - author = {Khoshnevisan, Farzaneh and Fan, Zhewen}, - year = 2019, - journal = {arXiv preprint arXiv:1911.07104} -} -@inproceedings{nguyen2013fchain, - title = {Fchain: Toward black-box online fault localization for cloud systems}, - author = {Nguyen, Hiep and Shen, Zhiming and Tan, Yongmin and Gu, Xiaohui}, - year = 2013, - booktitle = {2013 IEEE 33rd International Conference on Distributed Computing Systems}, - pages = {21--30}, - organization = {IEEE} -} -@incollection{nguyen2011pal, - title = {Pal: P ropagation-aware a nomaly localization for cloud hosted distributed applications}, - author = {Nguyen, Hiep and Tan, Yongmin and Gu, Xiaohui}, - year = 2011, - booktitle = {Managing Large-scale Systems via the Analysis of System Logs and the Application of Machine Learning Techniques}, - pages = {1--8} -} -@inproceedings{wang2020root, - title = {Root-cause metric location for microservice systems via log anomaly detection}, - author = {Wang, Lingzhi and Zhao, Nengwen and Chen, Junjie and Li, Pinnong and Zhang, Wenchi and Sui, Kaixin}, - year = 2020, - booktitle = {2020 IEEE International Conference on Web Services (ICWS)}, - pages = {142--150}, - organization = {IEEE} -} -@inproceedings{ma2020automap, - title = {Automap: Diagnose your microservice-based web applications automatically}, - author = {Ma, Meng and Xu, Jingmin and Wang, Yuan and Chen, Pengfei and Zhang, Zonghua and Wang, Ping}, - year = 2020, - booktitle = {Proceedings of The Web Conference 2020}, - pages = {246--258} -} -@inproceedings{meng2020localizing, - title = {Localizing failure root 
causes in a microservice through causality inference}, - author = {Meng, Yuan and Zhang, Shenglin and Sun, Yongqian and Zhang, Ruru and Hu, Zhilong and Zhang, Yiyin and Jia, Chenyang and Wang, Zhaogang and Pei, Dan}, - year = 2020, - booktitle = {2020 IEEE/ACM 28th International Symposium on Quality of Service (IWQoS)}, - pages = {1--10}, - organization = {IEEE} -} + + % Research Gap @misc{GoingtoM51:online, title = {Going to Market Faster: Most Companies Are Deploying Code Weekly, Daily, or Hourly}, @@ -465,24 +201,6 @@ @misc{Googlead4:online note = {(Accessed on 09/14/2021)}, howpublished = {\url{https://www.theregister.com/2021/02/25/google_kubernetes_autopilot/}} } -@misc{Unsuperv29:online, - title = {Unsupervised Learning and Data Clustering}, - author = {Sanatan Mishra}, - year = 2017, - month = {05}, - note = {(Accessed on 10/08/2021)}, - howpublished = {\url{https://towardsdatascience.com/unsupervised-learning-and-data-clustering-eeecb78b422a}} -} -@article{silver2016mastering, - title = {Mastering the game of Go with deep neural networks and tree search}, - author = {Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others}, - year = 2016, - journal = {nature}, - publisher = {Nature Publishing Group}, - volume = 529, - number = 7587, - pages = {484--489} -} % Research methods @misc{1Philoso75:online, title = {Philosophy of Science | Four Major Paradigms}, diff --git a/proposal/sections/appendix.tex b/proposal/sections/appendix.tex new file mode 100644 index 0000000..e400638 --- /dev/null +++ b/proposal/sections/appendix.tex @@ -0,0 +1,5 @@ +\begin{figure}[!ht] + \chapter{Gantt Chart} \label{appendix:gantt-chart} + \centering + \includegraphics[height=22cm]{assets/gantt-chart.png} +\end{figure} \ No newline at end of file diff --git a/proposal/sections/background.tex 
b/proposal/sections/background.tex new file mode 100644 index 0000000..4647280 --- /dev/null +++ b/proposal/sections/background.tex @@ -0,0 +1,19 @@ + + +{\let\clearpage\relax \chapter{Problem Background}} +% {\let\clearpage\relax \chapter{Problem Domain}} + +\section{Cloud Computing} +With the emergence of \ac{iaas} providers like Amazon Web Services (AWS) and Google Cloud Platform (GCP), there is a big surge in organizations trying to outsource their computing needs to third parties \citep{rimol_2021}. This is mainly due to the elasticity given by all the cloud providers. Users can easily scale their infrastructures up and down within minutes without making any commitment, and all the major providers bill users on a what-you-use-is-what-you-pay model. Because the cloud provider manages all the underlying infrastructure, users don't have to worry about problems like hardware failures. In contrast, in a self-hosted setting, if the user wanted one extra GB of memory beyond what's available, it would require a lot of effort and cost to fulfill that requirement. + +\section{Cloud-Native Applications} +During the 90s and early 2000s, all applications were made as a big monolith from a single code base \citep{LessonsF52:online}. Most of them were shipped as a single binary. Since applications in those days were fairly simple, this worked very well with little to no downsides. But when the 2010s came around, there were a lot of specialized frameworks and programming languages, and marketing teams wanted a lot of new features developed quickly while still maintaining reliability \citep{di2018migrating,Microser52:online}. But if the code base of the application was stored in a single repository, developers had to go through a long process to review and test that the changes wouldn't break the current system, and developers were also limited by the framework and programming language the initial developers chose for the project. 
+ +To tackle these problems, a new way to develop applications was introduced, called "Microservices". The idea behind this concept is to break all the functionalities of a big monolith application into small, individually scalable services and give ownership of each service to small teams of people who work separately. With this flow, developers are free to use whatever tool they like to develop each service. Because these services are developed in parallel by different teams, this increases the development velocity by an order of magnitude \citep{Understa56:online}. + +As these services are relatively small and tailor-made to run on cloud environments, it's very easy to take something that's running on the developer's local machine to the production cluster in a matter of minutes. This is mainly thanks to modern cloud-native tools like CI/CD pipelines, which automatically build and test the code for them, saving a lot of time otherwise spent doing repetitive tasks that are prone to human errors \citep{Whataret68:online}. + +\section{Monitoring Cloud-Native Applications} \label{monitoring-bg} +Even though cloud-native applications have a lot to offer when it comes to developer velocity and productivity, they have their fair share of issues. Most of these problems are linked to the sheer complexity of these systems and not having a proper way to monitor them \citep{5WaysYou35:online}. All 3 major cloud providers provide a way to monitor these applications efficiently, and some great open-source projects do this well. But to take full advantage of those systems, developers have to adapt their services to export all the vitals in a way the monitoring system understands. This works for the most part and is what all the big companies are doing; even if it takes more developer time, in the end it's very crucial when it comes to disaster recovery. + +But there is still a slight problem with this approach. 
Once the system starts to scale up to 100s of services, the number of vitals that have to be monitored goes into the 1000s; this will require a lot of additional \acp{sres}, and teams will have to drop a lot of non-crucial service vitals and derive abstract \acp{sli} to make it \textbf{humanly} possible to understand what's going on.\\ diff --git a/proposal/sections/introduction.tex b/proposal/sections/introduction.tex new file mode 100644 index 0000000..4c8af17 --- /dev/null +++ b/proposal/sections/introduction.tex @@ -0,0 +1,11 @@ +\chapter{Introduction} + +% Cloud computing is at steady rise for past few years due to its scalability and ease of use. With this change, a new programming paradigm called cloud-native was born. Cloud-native applications are often developed as a set of stand-alone microservices \citep{dragoni2017microservices} yet could depend on each other to provide a unified experience. + +% This helps different teams to work on different services which increases the development velocity. This works well for medium to large companies but over time when this mesh of services could become very complicated to a point where it's very difficult for one person to understand the whole system. When the system consists of 1000s of individual services talking and depending on each other, the network layer of that system becomes chaotic \citep{Introduc54:online} and failure in a single point can create a ripple effect across the system. When something like that happens it's really difficult to zero in on the exact point of failure quickly. + +% In this document author will explain the problem that's getting tackled, why it needs to be solved and how the author is planing to solve the problem within upcoming months. + +% This document describes the problem, research prospects and the course of action for the upcoming months of research. In line with this, proofs of the problem and prior attempts are also explored. 
Finally, the estimated timelines of the project and expected deliverables are discussed. + +This document was made to provide the necessary context about one of the main pain points that arise when it comes to maintaining distributed systems, and a course of action that could be taken to reduce them. To do that, the author will first give a brief overview of the target domain and the existing steps that have already been taken; then the author talks about shortcomings and improvements that can be made to them. Finally, the document will be concluded with how the author will approach the problem and try to solve it. \ No newline at end of file diff --git a/proposal/sections/methodology/development.tex b/proposal/sections/methodology/development.tex new file mode 100644 index 0000000..ac51c10 --- /dev/null +++ b/proposal/sections/methodology/development.tex @@ -0,0 +1,16 @@ + +{\let\clearpage\relax \chapter{Development Methodology}} + +Even though this project has a few clearly defined requirements, designing and developing them will require an iterative model, as there isn't a single best way to develop this and the author will be experimenting with different techniques. Thus the author decided on using \textbf{prototyping} as the \ac{sdlc} Model for this project.\\ + +\section{Design Methodology} + +To design the system diagrams for this project, \ac{ooad} methods will be used. \ac{ooad} makes it easier to design the system iteratively, and this complements the chosen \ac{sdlc} method, Prototyping. + +\section{Evaluation Methodology} + +During the literature survey, the author concluded that there are not any specific evaluation metrics for root cause analysis systems other than accuracy and F1 score, and there are not any publicly available datasets or systems to benchmark against. Base-level benchmarks will be carried out to compare the proposed system with the existing ones. 
+ +\section{Requirements Elicitation} + +As the results of this project will be mostly used by \acp{sres} and system administrators, the author is hoping to talk with a few of the experts in the respective fields to get a better idea of what is expected from a system like this. Moreover, as mentioned in \ref{sec:out-scope}, this system is not designed to entirely replace existing monitoring systems, so the author is hoping to research production monitoring systems and their workflows to understand how the proposed system could seamlessly integrate with them. diff --git a/proposal/sections/methodology/project-management.tex b/proposal/sections/methodology/project-management.tex new file mode 100644 index 0000000..7a10922 --- /dev/null +++ b/proposal/sections/methodology/project-management.tex @@ -0,0 +1,139 @@ + +{\let\clearpage\relax \chapter{Project Management Methodology}} + +To manage the tasks of this project, the author decided to use \textbf{Agile PRINCE2}. Agile PRINCE2 is built upon the waterfall method, which works best for projects with fixed deadlines and requirements, with the added benefit of having regulated inputs and outputs \citep{WhatAreT79:online}. + +\section{Deliverables} +\setlength\LTleft{0mm} +\begin{longtable}{|p{115mm}|p{35mm}|} +\hline +\textbf{Deliverable} & \textbf{Date} \\ \hline +\textbf{Draft Project Proposal} & \multirow{2}{*}{02nd September 2021} \\ +A draft version of this proposal & \\ \hline +\textbf{A working beta of MicroSim}\label{microsim} & \multirow{2}{*}{15th September 2021} \\ +MicroSim is a tool that simulates a distributed system within a Kubernetes cluster. This tool will be used to test and evaluate the final version of this project & \\ \hline +\textbf{Research Paper about MicroSim} & \multirow{2}{*}{16th October 2021} \\ +MicroSim could have various other use-cases and could help in the development of this research domain. 
So the author is planning to release it as an open-source project with paper so future research and benefits from this. & \\ \hline +\textbf{Literature Review Document} & \multirow{2}{*}{21st October 2021} \\ +The Document explaining all the existing tools and published researches on the domain & \\ \hline +\textbf{Project Proposal} & \multirow{2}{*}{04th November 2021} \\ +The final version of this project proposal. & \\ \hline +\textbf{Software Requirement Specification} & \multirow{2}{*}{25th November 2021} \\ +The Document all the key requirements that are gonna get address with this research & \\ \hline +\textbf{Proof of Concept} & \multirow{2}{*}{06th December 2021} \\ +Unoptimized prototype with all the main features working & \\ \hline +\textbf{Interim Progress Report (IPR)} & \multirow{2}{*}{27th January 2022} \\ +The document explaining all the preliminary findings and the current state of the project & \\ \hline +\textbf{Test and Evaluation Report} & \multirow{2}{*}{17th March 2022} \\ +A document with results of the project and conclusion made from those tests & \\ \hline +\textbf{Draft Project Reports} & \multirow{2}{*}{31st March 2022} \\ +The draft version of the final thesis & \\ \hline +\textbf{Final Research Paper} & \multirow{2}{*}{14th April 2022} \\ +A paper with results about this project & \\ \hline +\textbf{Final Project Report} & \multirow{2}{*}{28th April 2022} \\ +Finalize version of the thesis & \\ \hline +\caption{Deliverables and due dates} +\end{longtable} + + +\newpage +\section{Schedule} +% Gantt chart is a visualization of the task with their respective timelines. Refer Appendix \ref{appendix:gantt-chart} to find the gantt chart for this project. 
+\begin{figure}[!ht] + % \chapter{Gantt Chart} \label{appendix:gantt-chart} + % \centering + % \includegraphics[width=15cm]{assets/gantt-chart.jpg} + \includegraphics[height=22cm]{assets/gantt-chart.jpg} + \caption{Defined gantt chart for the project (self composed)} +\end{figure} + + +\section{Resource Requirement} + +\subsection{Software Requirements} + +\begin{itemize}[noitemsep,nolistsep] +\item \textbf{Ubuntu / Arch Linux} - Since this project will use \ac{ebpf} as a dependency it will require a Linux kernel based operating system. +\item \textbf{Python / R} - This project has a data science. So using a language with good data science eco-system will make this process easier. +\item \textbf{GoLang / Rust} - While GoLang has official client library made by Kubernetes developers themselves, kube-community has developed an excellent alternative in Rust. +\item \textbf{K3d / Minikube} - To create a Kubernetes cluster locally for development and testing. +\item \textbf{Jetbrain IDEs / VS Code} - IDE provides lot of tools that will help developing complex project like this easily. +\item \textbf{Google Docs / Overleaf} - To create documation about the project the author can use a usual editor like Google Docs or declaratively tool like Overleaf which use coding like style to format the document. +\item \textbf{Google Drive / Github} - Offsite location to backup the codebase and related documents. +\item \textbf{ClickUp / Notion} - To manage the project and keep track of things to be done. +\end{itemize} + +\subsection{Hardware Requirements} +\begin{itemize}[noitemsep,nolistsep] + \item \textbf{Quad-core CPU with AVX support} - AVX is a CPU instruction set which is optimze for vector operations. Having an AVX supported CPU could reduce the model inference time. + \item \textbf{GPU with CUDA support and 2GB or more VRAM} - Both Tensorflow and Pytorch depend on CUDA for hardware-accelerated training. 
Training on GPU could save a lot of time and increase the number of trial and error iterations that could be done. + \item \textbf{16 GB or more Memory} - Running a microservices simulation locally will consume a lot of memory and while testing models will get loaded into RAM. + \item \textbf{At least 40GB disk space} - To store the dataset, models, and docker containers while developing the project. +\end{itemize} + +\subsection{Skill Requirements} +\begin{itemize}[noitemsep,nolistsep] + \item \textbf{Experience working with Kubernetes} - The author will be developing a Kubernetes extension so they need to know the inner workings of Kubernetes. + \item \textbf{Data engineering} - Developing a data encoding technique requires a lot of knowledge of how to manipulate a given dataset. + \item \textbf{Model engineering} - Creating a model from the ground up is a difficult task. So the author needs to have an in-depth idea about a machine learning framework and how different layers in the model work in order to fit them properly. +\end{itemize} + +\subsection{Data Requirements} +\begin{itemize}[noitemsep,nolistsep] +\item \textbf{Monitoring dataset} - This dataset can be collected using the \hyperref[microsim]{MicroSim} tool the author plans to develop to simulate a distributed system. 
+\end{itemize} + +\section{Risk Management} + + +\begin{longtable}{|p{4.8cm}|p{1.35cm}|p{1.8cm}|p{7cm}|} + \hline + \textbf{Risk Item} & + \textbf{Severity} & + \textbf{Frequency} & + \textbf{Mitigation Plan} + \\ \hline + + The hypothesis the research is based on is wrong & + 5 & + 1 & + Present the findings and explain why the hypothesis was wrong + \\ \hline + + Failure in work computer & + 4 & + 3 & + Daily backup work the work to a cloud platform + \\ \hline + + Lack of domain knowledge & + 2 & + 3 & + Talk to a domain expert, Do more research + \\ \hline + + Models not generalizing & + 3 & + 4 & + Explore different methods, Try cleaning up the dataset more + \\ \hline + + Dataset quality is not up to the standard & + 4 & + 1 & + Use a method used in related researches to create a new dataset + \\ \hline + + Running out of time & + 1 & + 2 & + Following a thorough work schedule + \\ \hline + + Getting sick and unable to work for few days & + 3 & + 3 & + Keeping few days of a buffer period before deadlines + \\ \hline + \caption{Risks and mitigations} +\end{longtable} \ No newline at end of file diff --git a/proposal/sections/methodology/research.tex b/proposal/sections/methodology/research.tex new file mode 100644 index 0000000..56cbf4a --- /dev/null +++ b/proposal/sections/methodology/research.tex @@ -0,0 +1,31 @@ + +{\let\clearpage\relax \chapter{Research Methodology}} + +% \begin{longtable}{|p{4cm}|p{10cm}|} +% \begin{table} +% \setlength\LTleft{-10mm} +\begin{longtable}{|p{35mm}|p{125mm}|} +\hline + \textbf{Research Philosophy} & + Mainly, there are four research philosophies, Pragmatism, positivism, realism, and interpretivism. It explains the belief and the research is done. 
After doing an in-depth study about research philosophies, the author decided on following \textbf{Pragmatism} as the research philosophy because the author believes there is no one way to solve the problem this research is trying to address and the goal of this research is to solve a practical problem faced by \acp{sres}. (\cite{1Philoso75:online}, \cite{Pragmati87:online}) + \\ \hline + + \textbf{Research Approach} & + Although the inspiration for the research came from an observation of the real world, the author is using \textbf{deductive reasoning} to approach the problem. After the problem was identified the author looked for existing work and found a few theories on the domain. Then the author found a few flaws in these methods and thought of a way to address them with different approaches. At the end of the research the author hopes to implement these new approaches and observe their outcome. + \\ \hline + + \textbf{Research Strategy} & + The research strategy will be used to answer the research questions. In this project, the author will use \textbf{experimenting, interviews, and surveys} to provide answers to research questions. + \\ \hline + + \textbf{Research Choice} & + During this research project, the author is planning to build a very generalized solution to predict anomalies. So to achieve this, a \textbf{quantitative} dataset will be used to train the model while a \textbf{qualitative} data set will be used to evaluate it. So the data for this research will be collected by using the \textbf{Mixed method}. + \\ \hline + + \textbf{Time Horizon} & + This project needs to be completed within 8 months, so a \textbf{cross-sectional} time horizon will be used to collect data to complete the project. 
+ \\ \hline + \caption{Research methodology selection} +\end{longtable} +% \setlength\LTleft{10mm} +% \end{table} \ No newline at end of file diff --git a/proposal/sections/problem.tex b/proposal/sections/problem.tex new file mode 100644 index 0000000..9a4d4e1 --- /dev/null +++ b/proposal/sections/problem.tex @@ -0,0 +1,23 @@ + +% {\let\clearpage\relax \chapter{Problem Domain}} + +% With the rise of cloud-native applications, \citep{CloudAdo16:online} a plethora of services to support these applications came to play. Kubernetes itself is also a tool develop to manage containerized microservices and it tries to solve most of the networking and service discovery challenges when it comes to containerized distributed systems. + +% As discussed in the section \ref{monitoring-bg} monitoring is considered one of the biggest challenges in microservices \citep{Understa56:online}. So to address this issue, 3 types of services were introduced. Log aggregators, distributed tracers, and \ac{apm} systems. With that, few companies started creating \ac{saas} products that integrate all 3 of these and show them under a single pane of glass. Some of the key players in this domain are Datadog, New Relic, and Dynatrace to name a few of many. This works well for the most part and a lot of companies no matter the size depend on these services to handle their observability needs. + +% If we take Datadog as an example it offers a wide variety of futures from simple data collection to metric forecasts for predictive monitoring. If developers configured their services with Datadog agents correctly, Datadog will help its users to visualize services performance from a very high level and when a problem occurs it gives all the tools that need to drill into the core and tally all the logs and performance and tracing data understand what's going on \citep{Datadog18:online}. 
They even have a module called watchdog which does aggregate all the \acp{apm} data in the background and tries to find issues in them but is currently only available as a private beta. + +% Even though there are a lot of great products available with all has a few common issues, one of the main issues is it's up to developers to implement metric exports and tracing in their services. As a person who did his placement in a small to medium size startup the author first hand, both managers and developers hesitate to spend time on things like these. Although platforms like Datadog support open initiatives like \href{https://opentelemetry.io/}{opentelemetry} to take the full power of these platforms services have to be architectured towards the observability platform that gonna get used, once committed it's very hard to migrate to another solution. Finally, all these services require users to send over all of their key data including logs to get the most out of it and it could open up a lot of security issues and privacy concerns down the road. + +\newpage + +{\let\clearpage\relax \chapter{Problem Definition}} + +One of the main problems in monitoring microservices is the sheer number of data they generate. It's humanly impossible to monitor the metrics of all the services and it's hard for a single person to understand the entire system. To overcome this \acp{sres} use abstracted metrics called \acp{sli} which measure the quality of the service at a higher level. \acp{sli} will tell when there is an issue in the system, but it's very hard to understand where the actual problem is from it along. To understand the root cause of the problem \acp{sres} need to dig into \acp{apm} of all the services and go through the logs of each of the troubling services. 
+ +When the system consists of 100s or 1000s of services that are interdepended it's really hard to find where the actual issue is coming from and it may require the attention from all the service owners of failing services to go through the logs and \acp{apm} and identify the actual root cause of the failure. +This could greatly increase the \ac{mttr} and waste a lot of developer time just looking at logs. \\ + +\section{Problem Statement} + +Modern distributed systems are becoming big and complex so that when a failure happens it requires collaboration with a lot of people to find the root cause. Implementing a machine learning model which will watch over all the services and reacts to anomalies in real-time could greatly reduce the \ac{mttr}.\\ \ No newline at end of file diff --git a/proposal/sections/project-scope.tex b/proposal/sections/project-scope.tex new file mode 100644 index 0000000..cf691dd --- /dev/null +++ b/proposal/sections/project-scope.tex @@ -0,0 +1,60 @@ +{\let\clearpage\relax\chapter{Project Scope}} + +From the literature survey and talking with industry, experts author found many issues they can address when developing the system, but some of those problems like interpretability on autoencoder \citep{ribeiro2016should} are hard to solve by someone at a level of an undergraduate. As this project is done by one developer in less than one year, it won't be possible to create a fully functional monitoring platform like Datadog or New Relic. The Force of this project is to see if the author can develop a single model that can monitor all kinds of services after transfer learning with few examples. \\ + +\newpage + +\section{In-scope} \label{sec:in-scope} +Following are the main forces of this project +\begin{itemize}[noitemsep,nolistsep] + \item Evaluation Framework + \begin{itemize}[noitemsep,nolistsep] + \item Ability to create service mesh out using Kubernetes native resources. 
+ \item Each service has the ability to simulate predefined error types. + \item Service mesh can be made up of services written in different programming languages and frameworks. + \item Built-in method to run stress tests. + \end{itemize} + \item Monitoring System + \begin{itemize}[noitemsep,nolistsep] + \item Low overhead data collection pipeline to collect service telemetry. + \item Reliability system which generates fewer false positives so it won't overwhelm the operators and false negatives will be caught by the main monitoring system. + \item Optimized models to have a fairly small memory footprint and CPU overhead. + \item Well generalized model which will be able to deploy with completely new services and it will learn to adapt to the new system. + \end{itemize} +\end{itemize} + + +% \item Constant changes to services +% \item Highly seasonal and noisy patterns +% \item few shot learning to convert to a new system +% \item Tunability +% \item Response to seasonal dependencies +% \end{enumerate} + + + +\section{Out-scope} \label{sec:out-scope} +The following will not be covered during this project +\begin{itemize}[noitemsep,nolistsep] + \item Evaluation Framework + \begin{itemize}[noitemsep,nolistsep] + \item Support for every major language and framework. + \item Working outside of Kubernetes eco-system. + \end{itemize} + \item Monitoring System + \begin{itemize}[noitemsep,nolistsep] + \item Interpretability - Describing the behavior of an autoencoder is a difficult task that won't be covered during the project. + \item System won't be trained against data from a real production system due to the lack of public datasets. + \item System won't have very high accuracy, as this will be the first line of defense this will try to avoid false positives to prevent adding more noise to alerting systems. + \item Automatically identify system topology. 
+ \item This will not be a drop-in replacement for existing monitoring systems, rather this will work with existing monitoring systems to reduce the \ac{mttr}. + \end{itemize} +\end{itemize} + +\section{Prototype Feature Diagram} +\begin{figure}[H] + \centering + \includegraphics[width=16cm]{assets/High-level-system-diagram.png} + \caption{Prototype feature diagram (self composed)} + \label{fig:high-level-diagram} +\end{figure} \ No newline at end of file diff --git a/proposal/sections/related-work.tex b/proposal/sections/related-work.tex new file mode 100644 index 0000000..51dc1e1 --- /dev/null +++ b/proposal/sections/related-work.tex @@ -0,0 +1,203 @@ + + +{\let\clearpage\relax\chapter{Existing Work}} + +\section{Anomaly detection} + +% \setlength\LTleft{-5mm} + +% \begin{longtable}{| p{20mm} | p{47mm} | p{47mm} | p{47mm} |} +\begin{longtable}{| p{20mm} | p{43mm} | p{43mm} | p{43mm} |} +\hline + \textbf{Citation} & + \textbf{Technology summary} & + \textbf{Improvements} & + \textbf{Limitations} \\ \hline + \cite{du2018anomaly} & + Tested most of common machine learning methods to detect anomalies and benchmarked them & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Used SLIs to monitored data + \item A lot of good metrics (input data) + \item Performance monitoring of services and containers + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Only be able to identify predetermined issues + \item Require a sidecar that includes a lot of overhead + \item Won't work with event-driven architectures (this is where most of the new systems are headed) + \item Uses Supervised learning and it's near impossible to find real-world data with labels + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{kumarage2018anomaly} & + The authors here are proposing a semi-supervised technique using a Variational Autoencoder to predict future time steps and calculate the difference between predicted and 
actual to detect anomalies. & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Due to the difficulty of finding labeled research data, they settled on using a semi-supervised technique. + \item Used randomized decision trees were utilized to select the most suitable features for each component. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item The model won't be easily transformable for other systems + \item If more new key features were added to the system it will require a total retraining + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{kumarage2019generative} & + Uses a bidirectional \ac{gan} to predict future timesteps and uses MSE between prediction and real to determine the anomalies & + Experimented using a \ac{gan} to detect anomalies rather than using conventional autoencoders & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Accuracy is around 60% which is not really good to use in production with mission-critical systems. + \item As this is a \ac{gan}-based system, it may take a lot of resources to run with production systems. 
+ \end{itemize} \\ \hline + \caption{Comparison of anomaly detection methods in distributed systems} +\end{longtable} + +\section{Root cause identification} + +% \begin{longtable}{| p{20mm} | p{47mm} | p{47mm} | p{47mm} |} +\begin{longtable}{| p{20mm} | p{43mm} | p{43mm} | p{43mm} |} +\hline + \textbf{Citation} & + \textbf{Technology summary} & + \textbf{Improvements} & + \textbf{Limitations} \\ \hline + \cite{gonzalez2017root} & + Detect failures in networks, using machine learning to generate knowledge graphs on historical data & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Build a predictable system + \item Automatic identification of dependencies between system events + \item Doesn't Need to rely on Domain experts + \item Generalized to different systems + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Limited to network issues + \item Even though the knowledge graph helped with visualization of the problem but still, people have to manually figure out what went wrong + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{chigurupati2017root} & + Proposed a way to detect Hardware failures in servers using a probabilistic graphical model which concisely describes the relationship between many random variables and their conditional independence & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Find hidden meaning in values that seems random + \item Used a probabilistic approach to better understand the relationship between inputs and outputs + \item Gives all the possible root cause to a given problem + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Limited to hardware issues + \item Require support from domain experts + \item Can't account for unforeseen error + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{samir2019dla} & + This detects and locates the anomalous behavior of 
microservices based on the observed response time using a \ac{hhmm} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Custom HHMM model + \item Self-healing mechanism + \item Focus on performance detection and identification at the container, node, and microservice level + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Input dataset scale is limited + \item Require a sidecar + \item Needs to predetermined thresholds + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{wu2020microrca} & + Find Performance bottlenecks in distributed systems using an attribute graph to find anomaly propagation across services and machines & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Created a custom Faults Injection module + \item Uses an attribute graph to localize to faulty service + \item Application-agnostic by using a service mesh + \item Rely on service mesh to determine network topology + \item Uses unsupervised learning + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Only able to identify 3 types of issues + \item Looks only for performance anomalies + \item Use the slow response time of a microservice as the definition of an anomaly + \item Service meshes add a lot of overhead to systems + \item Required direct connection between services + \vspace{-7mm} + \end{itemize} \\ \hline + \caption{Comparison of root cause identification methods in distributed systems} +\end{longtable} + +% \newpage +\section{Commercial products} + +% \begin{longtable}{| p{40mm} | p{60mm} | p{60mm} |} +\begin{longtable}{| p{40mm} | p{55mm} | p{55mm} |} +\hline + \textbf{Name} & + \textbf{Futures} & + \textbf{Limitations} \\ \hline + Applied Intelligence by New Relic & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Metric forecasting. + \item Anomaly detection. 
+ \item Alert grouping to reduce noise. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Lack of explainability for certain classifications. + \item All the telemetry data need to be sent to a third party. + \vspace{-7mm} + \end{itemize} \\ \hline + Watchdog by Datadog & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Monitor the metric data of the entire system from the background. + \item Monitor logging data. + \item Highlight relevant components affected by an issue. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Announced in 2018 but is still at private beta. + \item Require code changes and tight integration with datadog platform. + \item Available demos about the system seems to be engineered for demonstration purposes. + \vspace{-7mm} + \end{itemize} \\ \hline + \caption{Comparison of commercial products for root cause analysis} +\end{longtable} +% \setlength\LTleft{5mm} +% As the large-scale migration towards the cloud and microservices started fairly recently the problem this research is trying to solve mostly affects large-scale enterprises there ain't a lot of published research on this domain. All the work done towards uncovering the root cause of failures by large co-operations either kept their finds for internal use to sell it as \ac{saas} product. + +% One of the best implementations found on root cause analysis is from Datadog. They created a platform called watchdog \citep{Watchdog76:online} which monitors the entire system for anomalies and failures in the background. When a failure happens it tries to pull all the relevant stack traces and monitoring data to a single view so the developer can diagnose the problem easily. 
The problem with this solution is even though it was announced all the way back in July 2018, all that is available is currently in private beta which not everyone has access to. +% \\ +% All the currently published work on microservices monitoring can be classified into 2 categories +% \begin{enumerate} +% \item Anomaly detection +% \item Root cause identification +% \end{enumerate} + +% \section{Anomaly detection} + +% Anomaly detection in time series is a field of its own. So in this case we will be forcing papers that are specialized in the cloud computing domain. + +% One of the earliest attempts on detecting anomalies in microservices was \cite{du2018anomaly}. In this authors tried using 4 different machine learning techniques to detect performance anomalies. To do this say used a simulated system and various fault injection mechanisms to create the dataset. In the end, they concluded K Nearest Neighbors classifier gives the most accurate classifications while Support vector machines have the worse. + +% A common way to detect anomalies in time series is using an autoencoder to reconstruct a given time series. After training the model should be able to come up with the generalized function about the given time series and it will be able to recreate any input sequence accurately. But when there is an anomaly in the input sequence models output will be vastly different from the input. We can use this reconstruction loss as a metric to uncover anomalies within the system. In \cite{kumarage2018anomaly} authors used the method to detect anomalies in distributed systems. In a continuation of their work \cite{kumarage2019generative} they tried doing the same thing by using a \ac{gan} but in the end, they concluded even though it showed a tendency towards better performance when the dataset gets bigger, with the dataset they had autoencoders perform well overall. 
+ +% Ever since DeepMind came up with wavenet which used a CNN to generate audio samples \citep{oord2016wavenet} researchers uncovering other potential use cases other than image-related tasks. One of those use cases was as CNN excels at pattern recognition, encoding time series data set into image-like data structures and use a CNN to identify abnormal patterns in it. On \cite{kim2018encoding} authors tried to using a novel technique to raw encode data into a pixel-like structure and found it could outperform the existing methods to detect anomalies in computer networks. + +% \section{Root cause identification} + +% Predicting the exact root cause of failure just using a standard machine learning model is a pretty difficult task since prediction space is not finite. In 2017 a team from Google X tried using the Bayesian Network to model the relationship between the state of the system and its effect on failures \citep{chigurupati2017root}. Using it they were able to accurately predict all the possible root causes of a hardware failure in certain systems but this model required to predefine all the possible error modes by domain experts which isn't really possible in a constantly evolving distributed system. There were similar attempts \cite{gonzalez2017root} to use machine learning to generate knowledge graphs on historical data and help developers come up with reasoning to failures although this eliminated a need for a domain expert, this also can't react to unseen errors. + +% In a distributed system it's hard to spot real anomalies just by looking at monitoring data, but when there are huge spikes in response latencies or error rates it's a good indicator something must be wrong. So \cite{samir2019dla} used a \ac{hhmm} to uncover the possible affected services from changes in response time or error rates in one service and using that data to uncover the root cause of the issue. 
All of the papers discussed above have one problem in common they all assume the entire system is static but in reality, these services changes over time either with increased demand or new future implementations. To address this, \cite{wu2020microrca} developed a service that monitors all the running applications and their vitals. This also constructs an attributed graph that represents how each service interacts with the other. When the monitoring system detects an anomaly MicroRCA weight that graph with response time changes and tries to find the epicenter of the anomaly. The main problem with both of these approaches have is authors rely solely on slow response time as an indication of an anomaly but several other factors could course anomalous behaviors without changes in response times. diff --git a/proposal/sections/research/aim.tex b/proposal/sections/research/aim.tex new file mode 100644 index 0000000..de7285d --- /dev/null +++ b/proposal/sections/research/aim.tex @@ -0,0 +1,8 @@ + +{\let\clearpage\relax\chapter{Research Aim}} + +\textit{The aim of this research is to design, develop and evaluate a toolkit to help system operators to reduce the \ac{mttr} when the system is experiencing an anomaly by using a machine learning model investigating all the services in the system and highlighting the most probable root causes in order, So the operators don't have to find a needle in a haystack.} + +To achieve this author tries to create a single model that can monitor all the vitals of a given service and output an anomaly score in any given time window. The author is hoping to make it generalized enough so operators can take the same model and deploy it with other services and the model will adopt the new services with \ac{fsl} \citep{wang2020generalizing}. To do this author is trying to create a data encoding technique to represent monitoring data in a programming language or framework independent way. 
+ +Finally, the author is hoping to develop a playground that easily simulates a distributed system within a Kubernetes cluster so the created system can be tested and evaluated properly and future research in this domain will have a benchmark framework to evaluate their work. diff --git a/proposal/sections/research/challenge.tex b/proposal/sections/research/challenge.tex new file mode 100644 index 0000000..da8be8d --- /dev/null +++ b/proposal/sections/research/challenge.tex @@ -0,0 +1,10 @@ + +{\let\clearpage\relax\chapter{Research Challenge}} + +Even though this project seems very straightforward and easy to implement from a high level, it becomes tricky when attempting to reach targets defined in the section \ref{sec:in-scope}. For example, interpretability was one of the most requested features from industry experts and a must-have trait for mission-critical systems \citep{ribeiro2016should}. But it was left out of the project scope due to its complexity, especially when it comes to an \textbf{undergraduate project}. Other than that, the following are a few of the more difficult challenges the author is expected to face while conducting the research.\\ + +\begin{itemize}[leftmargin=*] +\item \textbf{Highly seasonal and noisy patterns} - Monitoring metrics of microservices in production tend to have very unpredictable patterns depending on the traffic that's sent to the service. The amount of traffic sent will depend on several external factors that are hard to determine. Modeling both temporal dependencies and interdependencies between monitoring data into a single graph will be very difficult and require a lot of fine-tuning and data engineering. 
+\item \textbf{Overhead} - Modern deep learning models can solve any problem if we could give it an unlimited amount of data and processing power but In this case, models need to optimize for efficiency over accuracy since having a monitoring system that consumes a lot more resource than the actual target system isn't effective. +\item \textbf{Fit into Kubernetes eco-system} - Kubernetes has become the de-facto standard to managing distributed systems \citep{WhatisCo78:online}. So the author is planning to create a Kubernetes extension that will bridge the connection between monitored service and monitoring model as shown in the figure \ref{fig:high-level-diagram}. But Kubernetes itself has a very steep learning curve, even the original developers themselves admitted it's too hard complex for beginners \cite{Googlead4:online}. +\end{itemize} diff --git a/proposal/sections/research/contribution.tex b/proposal/sections/research/contribution.tex new file mode 100644 index 0000000..b1fe333 --- /dev/null +++ b/proposal/sections/research/contribution.tex @@ -0,0 +1,11 @@ + +{\let\clearpage\relax\chapter{Research Contribution}} + + +\section{Domain Contribution} + +With this research, the author first tries to develop a \textbf{cloud-native solution to create a configurable microservices system}, So this research and future researches will have a standard environment to develop and evaluate their work. The author also hopes to build a lightweight and \textbf{low-overhead data collection pipeline} using \ac{ebpf} to collect telemetry of target services without any instrumentation from the user. + +\section{Knowledge Contribution} + +One of the main problems with monitoring microservices systems is different services can be developed with different programming languages and frameworks and those can contain different levels of noisiness\label{need-for-encoding}. 
So it's hard for a single model to detect anomalies in any service since some frameworks tend to use more resources while idle than others. So to address this author is trying to come up with an \textbf{encoding method} so the model can be trained to monitor one framework and those learning will still be valid for another framework. With those encoded data the author is hoping to develop a \textbf{convolutional autoencoder that will use unsupervised learning to spot out anomalies in a given data stream}. This may have better performance while using fewer resources convolutional layers are typically lightweight and good at pattern recognition \citep{oord2016wavenet}. Finally, the author is planning to aggregate those predictions from the models into a pre-generated service graph and weigh it to \textbf{find all possible root causes}. diff --git a/proposal/sections/research/gap.tex b/proposal/sections/research/gap.tex new file mode 100644 index 0000000..361fcad --- /dev/null +++ b/proposal/sections/research/gap.tex @@ -0,0 +1,8 @@ + +{\let\clearpage\relax\chapter{Research Gap}} + +After a literature survey author came conclusion finding a root cause of any failure within a distributed system is a very difficult issue due to it not having single output we can try to predict and most researchers have built their own simulation of a distributed system by themselves since there isn't any open dataset about monitoring data mainly because it could contain sensitive information. + +Most currently established researches are done towards creating statistical models like clustering and linear regression. Even though these algorithms perform very well in small-scale systems, they struggle to keep up when the monitoring data become very noisy with scale. Another problem none of these papers properly addressed was constant changes to services. 
Most published research considers target services as static, but in reality, these services can change many times per day \citep{GoingtoM51:online}. + +After talking with industry experts, the author concluded that there are three main issues with using a machine learning model as a monitoring agent: Reliability, Interpretability, and Tunability. On reliability, experts said too many false positives will make operators lose faith in the system because it is going to be another distraction to them. As the operators have to make critical decisions based on the output of these models, it has to be interpretable by humans \citep{ribeiro2016should}. Finally, this system should act more like a tool rather than a replacement for human operators, because machine learning models cannot compete with the context a human can handle. diff --git a/proposal/sections/research/motivation.tex b/proposal/sections/research/motivation.tex new file mode 100644 index 0000000..0811091 --- /dev/null +++ b/proposal/sections/research/motivation.tex @@ -0,0 +1,5 @@ + +{\let\clearpage\relax\chapter{Research Motivation}} + +Modern distributed systems generate tons of useful and not so useful telemetry data. As the system grows in demand and size, these telemetry data only get noisier and more complex \citep{Untangli35:online}. It's difficult for humans to make sense of all these data, especially if they don't have many years of experience with the system. On the other hand, deep learning models thrive when they have a lot of data to learn from. As these models can be trained in computer-simulated environments, they can learn concepts humans take years to grasp within days \citep{OpenAI_dota, silver2017mastering}. Finally, unlike humans, a deep learning model can monitor a service 24x7 without taking any breaks, which will not only prevent outages before they happen, but could also reduce \ac{mttr} because the issue can be detected way earlier than any human could do. 
+ diff --git a/proposal/sections/research/objective.tex b/proposal/sections/research/objective.tex new file mode 100644 index 0000000..b5ec62c --- /dev/null +++ b/proposal/sections/research/objective.tex @@ -0,0 +1,135 @@ + +{\let\clearpage\relax\chapter{Research Objectives}} + +\newcommand\robProblemIdentification{ +When selecting the problem author wanted to pursue, they had 3 main goals. +\begin{enumerate}[leftmargin=*,noitemsep,nolistsep] +\item The problem domain should be something they enjoy working in. +\item At the end of the research should have done a meaningful impact on the target domain, both in the theoretical and practical aspect, +\item It should be challenging to achieve and results should speak about themselves. +\vspace{-7mm} +\end{enumerate} +% After many iterations of trial and error the author settled on "Cloud Computing" as the domain, "Root cause analysis" as the problem because the author is a site reliability engineer by profession and quickly able to identifying the root cause of a failure could lower \ac{mttr}. +} + +\newcommand\robLiteratureReview{ +% After a general topic was identified, the author needed to do evaluate all the currently published work to understand what’s the current state of the problem and how other researchers and developers are approaching this problem. After an intensive literature survey author was able to identify a new angle to approach the domain. + +% During this period author contacted few experts in the cloud computing domain and evaluate the idea and plan for the project. +Conduct a Literature review on root cause analysis to, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item To find the current methods used to anomaly detection and localization. +\item Uncover issues with current approaches. +\item Understand how advancement in other related domains can apply to this domain. 
+\vspace{-7mm} +\end{itemize} +} + + +\newcommand\robDevelopingEvaluation{ +During the literature survey, one problem the author identified was there isn’t a uniform dataset when it comes to training and evaluating models to detect anomalies in microservices. Most of the researchers used private datasets to train and test their work. +To address this author is developing, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item A tool that can easily simulate a distributed system in a cloud-native setting. +\item A tool inject anomalies into the running services. +\vspace{-7mm} +\end{itemize} +} + +\newcommand\robPublishPlayground{ +The author is hoping to publish a paper about the above-mentioned tool so the future researchers will have a unified way to train, test, and benchmark their system without having to reinvent the wheel again and again. +} + +\newcommand\robDataGathering{ +% The author plans to use the above-mentioned tool to simulate a large-scale distributed system made up of services done in different frameworks and subject it to a load test. Then collect the monitoring data from that to train the model. +In order to create model to detect anomalies the author will, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item Simulate distributed system. +\item Simulate traffic inside the system +\item Collect monitoring data while it's running +\vspace{-7mm} +\end{itemize} +} + +\newcommand\robDevelopingEncoding{ +As mentioned in the section \ref{need-for-encoding} these services will report very different values even at idle. To normalize data from all the services to one format author will, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item Evaluate current data encoding methods like \cite{zhang2019deep}. +\item Find the best one fit and optimize it to this use case. +\item Test if there is any improvement by using this method. 
+\vspace{-7mm} +\end{itemize} + +% So there needs to be a way to normalize data from all the services to one format so the model can generalize for all the services no matter the framework it was built on. Inspired by \cite{zhang2019deep} the author is trying to develop or adopt an encoding technique to present data in an image-like structure so both ML models and humans can spot out anomalies easily. +} + + +\newcommand\robDevelopingModel{ +% Autoencoders have been outperforming all other types of models \citep{kumarage2019generative} when it comes to anomaly detection. Since this project already has a module that converts raw data to an image-like structure the author is hoping to use a convolution autoencoder which will be lighter and has the potential to outperform normal autoencoders when paired with the above data encoding technique. +According to \cite{kumarage2019generative} Autoencoders tend to perform best when it comes to anomaly detection. But during the literature survey it was revealed that Convolutional Autoencoders weren't tested. So the author tries to develop a Convolutional Autoencoder and test how it will perform. +} + + +\newcommand\robTesting{ +The following things will be tested during the testing phase, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item How will the system classify long-term fluctuations. +\item How will the system classify short-term fluctuations. +\item Can the system understand the mapping between core metrics like CPU and Memory usages. +\item Accuracy of fault detection. +\item Accuracy of root cause localization. +\vspace{-7mm} +\end{itemize} +} +% The author hopes to carry an extensive evaluation on the system with a wide variety of edge cases and the author is hoping to see how the model identifies both short-term and long-term fluctuations and whether it can properly find a mapping between core vitals like CPU and Memory usages. 
+ + +\newcommand\robIntegration{ +Having a fancy model doesn’t add means anything if it’s very hard to use in a real system. So the author is hoping to develop a Kubernetes extension that will map the model with any service given by the user. +} + + +% \begin{table}[] +% \setlength\LTleft{-5mm} +\begin{longtable}{|p{38mm}|p{95mm}|p{17mm}|} +% \begin{longtable}{|p{40mm}|p{100mm}|p{20mm}|} +\hline +\textbf{Research Objectives} & \textbf{Explanation} & \textbf{Learning Outcome} \\ \hline +Problem identification & \robProblemIdentification & LO1 \\ \hline +Literature review & \robLiteratureReview & LO3, LO4, LO6 \\ \hline +Developing an evaluation framework & \robDevelopingEvaluation & LO7 \\ \hline +Publish a paper about that playground & \robPublishPlayground & LO7 \\ \hline +Data gathering and analysis & \robDataGathering & LO7 \\ \hline +Developing encoding method & \robDevelopingEncoding & LO2, LO5, LO7 \\ \hline +Developing the model & \robDevelopingModel & LO2, LO5, LO7 \\ \hline +Testing and evaluation & \robTesting & LO8, LO9 \\ \hline +Integration & \robIntegration & LO7 \\ \hline +\caption{Research objectives} +\end{longtable} +% \setlength\LTleft{0mm} +% \end{table} +% \subsection{Project Objectives} +% \begin{itemize} +% \item Find area I am interested in +% \item Find a issue in it +% \item Evaluate the issue with experts in the field +% \item Create PID +% \item Create playground to test the final product in +% \item Publish a paper about that playground +% \item Finalize on requirements +% \item Develop prototype and Document the progress +% \item Create operator to plugin any ML model to monitor +% \item Publish paper with results +% \item complete thesis +% \item conclude the project +% \item open source the code +% \end{itemize} + +% \subsection{Research Objectives} +% \begin{itemize} +% \item Literature Survey +% \item Requirement Analysis +% \item Design +% \item Development +% \item Testing +% \end{itemize} diff --git 
a/proposal/sections/research/question.tex b/proposal/sections/research/question.tex new file mode 100644 index 0000000..2a85bdb --- /dev/null +++ b/proposal/sections/research/question.tex @@ -0,0 +1,17 @@ + +{\let\clearpage\relax\chapter{Research Question}} + + +\begin{enumerate}[leftmargin=*,label=\textbf{RQ\arabic*:}] + +\item How can a machine learning model improve \ac{mttr} in a distributed system? + +\item What is the most efficient way to present raw data monitoring to machine learning model? + +\item What will be the most ideal machine learning model to uncover anomalies in a microservice? + +\item What are the methods that can be used to evaluate a root cause prediction system? + +\end{enumerate} + + diff --git a/proposal/sections/research/sections/appendix.tex b/proposal/sections/research/sections/appendix.tex new file mode 100644 index 0000000..e400638 --- /dev/null +++ b/proposal/sections/research/sections/appendix.tex @@ -0,0 +1,5 @@ +\begin{figure}[!ht] + \chapter{Gantt Chart} \label{appendix:gantt-chart} + \centering + \includegraphics[height=22cm]{assets/gantt-chart.png} +\end{figure} \ No newline at end of file diff --git a/proposal/sections/research/sections/background.tex b/proposal/sections/research/sections/background.tex new file mode 100644 index 0000000..4647280 --- /dev/null +++ b/proposal/sections/research/sections/background.tex @@ -0,0 +1,19 @@ + + +{\let\clearpage\relax \chapter{Problem Background}} +% {\let\clearpage\relax \chapter{Problem Domain}} + +\section{Cloud Computing} +With an emergence \ac{iaas} like Amazon Web Services (AWS) and Google Cloud Platform (GCP) there is a big surge in organizations trying to outsource their computing needs to third parties \citep{rimol_2021}. This is mainly due to the elasticity given by all the cloud providers. 
Users can easily scale up and down their infrastructures within minutes without making any commitment, and all the major providers bill users on a what-you-use-is-what-you-pay model. Because the cloud provider manages all the underlying infrastructure, users don't have to worry about problems like hardware failures. In contrast, in a self-hosted setting, if the user wanted one extra GB of memory than what's available, it requires a lot of effort and cost to fulfill that requirement. + +\section{Cloud-Native Applications} +During the 90s and early 2000s, all the applications were made as a big monolith from a single code base \citep{LessonsF52:online}. Most of them were shipped as a single binary. Since those days applications were fairly simple, this worked very well with little to no downsides. But when the 2010s came around, there were a lot of specialized frameworks and programming languages, and marketing teams wanted a lot of new features developed quickly while still maintaining reliability \citep{di2018migrating,Microser52:online}. But if the code base of the application was stored in a single repository, developers have to go through a long process to review and test if the changes won't break the current system, and developers are also limited by the framework and programming language the initial developers chose for the project. + +To tackle these problems a new way to develop applications was introduced, called "Microservices". The idea behind this concept is to break all the functionalities of big monolith applications into small individually scalable services and give ownership of each service to small teams of people who work separately. With this flow developers are free to use whatever tool they like to develop each service. Because these services are developed in parallel by different teams, this increases the development velocity by an order of magnitude \citep{Understa56:online}. 
+ +As these services are relatively small and tailor-made to run on cloud environments it's very easy to take something that's running on the developer's local machine to the production cluster in a matter of minutes. This is mainly thanks to modern cloud-native tools like CI/CD pipelines which automatically build and test the code for them, which can save a lot of time spent just doing repetitive tasks which are prone to human errors \citep{Whataret68:online}. + +\section{Monitoring Cloud-Native Applications} \label{monitoring-bg} +Even though cloud-native applications have a lot to offer when it comes to developer velocity and productivity, It has its fair share of issues. Most of these problems are linked to the sheer complexity of these systems and not having a proper way to monitor them \citep{5WaysYou35:online}. All 3 major cloud providers provide a way to monitor these applications efficiently and some great open-source projects do this well, But to take full advantage of those systems, developers have to adapt their services to export all the vitals in a way the monitoring system understand. This works for the most part and this is what all the big companies are doing, even if it takes more developer time to in the end it's very crucial when it comes to disaster recovery. + +But there is still a slight problem with this approach. 
Once the system starts to scale up to 100s of services number vitals that has to be monitored goes to 1000s and will require a lot of additional \acp{sres} and will have drop lot of non-crucial service vitals and derive abstract \acp{sli} to make it \textbf{humanly} possible to understand what's going on.\\ diff --git a/proposal/sections/research/sections/introduction.tex b/proposal/sections/research/sections/introduction.tex new file mode 100644 index 0000000..4c8af17 --- /dev/null +++ b/proposal/sections/research/sections/introduction.tex @@ -0,0 +1,11 @@ +\chapter{Introduction} + +% Cloud computing is at steady rise for past few years due to its scalability and ease of use. With this change, a new programming paradigm called cloud-native was born. Cloud-native applications are often developed as a set of stand-alone microservices \citep{dragoni2017microservices} yet could depend on each other to provide a unified experience. + +% This helps different teams to work on different services which increases the development velocity. This works well for medium to large companies but over time when this mesh of services could become very complicated to a point where it's very difficult for one person to understand the whole system. When the system consists of 1000s of individual services talking and depending on each other, the network layer of that system becomes chaotic \citep{Introduc54:online} and failure in a single point can create a ripple effect across the system. When something like that happens it's really difficult to zero in on the exact point of failure quickly. + +% In this document author will explain the problem that's getting tackled, why it needs to be solved and how the author is planing to solve the problem within upcoming months. + +% This document describes the problem, research prospects and the course of action for the upcoming months of research. In line with this, proofs of the problem and prior attempts are also explored. 
Finally, the estimated timelines of the project and expected deliverables are discussed. + +This document was made to provide the necessary context about one of the main pain points that arises when it comes to maintaining distributed systems and a course of actions that could be taken to reduce them. To do that author will first give a brief overview of the target domain and existing steps that have already been taken, then the author talks about shortcomings and improvements that can be made to them. Finally, the document will be concluded with how the author will approach the problem and try to solve it. \ No newline at end of file diff --git a/proposal/sections/research/sections/methodology/development.tex b/proposal/sections/research/sections/methodology/development.tex new file mode 100644 index 0000000..ac51c10 --- /dev/null +++ b/proposal/sections/research/sections/methodology/development.tex @@ -0,0 +1,16 @@ + +{\let\clearpage\relax \chapter{Development Methodology}} + +Even though this project has few clearly defined requirements, designing and developing them will require an iterative model as there isn't a single best way to develop this and the author will be experimenting with different techniques. Thus the author decides on using \textbf{prototyping} as the \ac{sdlc} Model for this project.\\ + +\section{Design Methodology} + +To design the system diagrams for this project \ac{ooad} methods will be used. \ac{ooad} make it easier to design the system iterative and this complement the choice \ac{sdlc} method Prototyping. + +\section{Evaluation Methodology} + +During the literature, the survey author concluded that there are not any specific evaluation metrics for the root cause analysis system other than accuracy and f1 score, and there are not any publicly available datasets or systems to benchmark against. Base-level benchmarks will be carried out to compare the proposed system with the existing ones. 
+ +\section{Requirements Elicitation} + +As the results of this project will be mostly used by \acp{sres} and system administrator the author is hoping to talk with few of the experts in the respective fields to get a better idea on what are the things to be expected from a system like this. Moreover as mentioned in \ref{sec:out-scope} this system is not designed to entirely replace existing monitoring systems, So the author is hoping to research about production monitoring systems and their workflows to understand how the proposed system could seamlessly integrate them. diff --git a/proposal/sections/research/sections/methodology/project-management.tex b/proposal/sections/research/sections/methodology/project-management.tex new file mode 100644 index 0000000..7a10922 --- /dev/null +++ b/proposal/sections/research/sections/methodology/project-management.tex @@ -0,0 +1,139 @@ + +{\let\clearpage\relax \chapter{Project Management Methodology}} + +To manage task of this project authors decide to use \textbf{Agile PRINCE2}. Agile PRINCE2 built upon waterfall method which works best for projects with fixed deadlines and requirements with the added benefit of having regulated inputs and outputs \citep{WhatAreT79:online}. + +\section{Deliverables} +\setlength\LTleft{0mm} +\begin{longtable}{|p{115mm}|p{35mm}|} +\hline +\textbf{Deliverable} & \textbf{Date} \\ \hline +\textbf{Draft Project Proposal} & \multirow{2}{*}{02nd September 2021} \\ +A draft version of this proposal & \\ \hline +\textbf{A working beta of MicroSim}\label{microsim} & \multirow{2}{*}{15th September 2021} \\ +MicroSim is a tool that simulates a distributed system within a Kubernetes cluster. This tool will be used to test and evaluate the final version of this project & \\ \hline +\textbf{Research Paper about MircoSim} & \multirow{2}{*}{16th October 2021} \\ +MicroSim could have various other use-cases and could help in the development of this research domain. 
So the author is planning to release it as an open-source project with paper so future research and benefits from this. & \\ \hline +\textbf{Literature Review Document} & \multirow{2}{*}{21st October 2021} \\ +The Document explaining all the existing tools and published researches on the domain & \\ \hline +\textbf{Project Proposal} & \multirow{2}{*}{04th November 2021} \\ +The final version of this project proposal. & \\ \hline +\textbf{Software Requirement Specification} & \multirow{2}{*}{25th November 2021} \\ +The Document all the key requirements that are gonna get address with this research & \\ \hline +\textbf{Proof of Concept} & \multirow{2}{*}{06th December 2021} \\ +Unoptimized prototype with all the main features working & \\ \hline +\textbf{Interim Progress Report (IPR)} & \multirow{2}{*}{27th January 2022} \\ +The document explaining all the preliminary findings and the current state of the project & \\ \hline +\textbf{Test and Evaluation Report} & \multirow{2}{*}{17th March 2022} \\ +A document with results of the project and conclusion made from those tests & \\ \hline +\textbf{Draft Project Reports} & \multirow{2}{*}{31st March 2022} \\ +The draft version of the final thesis & \\ \hline +\textbf{Final Research Paper} & \multirow{2}{*}{14th April 2022} \\ +A paper with results about this project & \\ \hline +\textbf{Final Project Report} & \multirow{2}{*}{28th April 2022} \\ +Finalize version of the thesis & \\ \hline +\caption{Deliverables and due dates} +\end{longtable} + + +\newpage +\section{Schedule} +% Gantt chart is a visualization of the task with their respective timelines. Refer Appendix \ref{appendix:gantt-chart} to find the gantt chart for this project. 
+\begin{figure}[!ht] + % \chapter{Gantt Chart} \label{appendix:gantt-chart} + % \centering + % \includegraphics[width=15cm]{assets/gantt-chart.jpg} + \includegraphics[height=22cm]{assets/gantt-chart.jpg} + \caption{Defined gantt chart for the project (self composed)} +\end{figure} + + +\section{Resource Requirement} + +\subsection{Software Requirements} + +\begin{itemize}[noitemsep,nolistsep] +\item \textbf{Ubuntu / Arch Linux} - Since this project will use \ac{ebpf} as a dependency it will require a Linux kernel based operating system. +\item \textbf{Python / R} - This project has a data science. So using a language with good data science eco-system will make this process easier. +\item \textbf{GoLang / Rust} - While GoLang has official client library made by Kubernetes developers themselves, kube-community has developed an excellent alternative in Rust. +\item \textbf{K3d / Minikube} - To create a Kubernetes cluster locally for development and testing. +\item \textbf{Jetbrain IDEs / VS Code} - IDE provides lot of tools that will help developing complex project like this easily. +\item \textbf{Google Docs / Overleaf} - To create documation about the project the author can use a usual editor like Google Docs or declaratively tool like Overleaf which use coding like style to format the document. +\item \textbf{Google Drive / Github} - Offsite location to backup the codebase and related documents. +\item \textbf{ClickUp / Notion} - To manage the project and keep track of things to be done. +\end{itemize} + +\subsection{Hardware Requirements} +\begin{itemize}[noitemsep,nolistsep] + \item \textbf{Quad-core CPU with AVX support} - AVX is a CPU instruction set which is optimze for vector operations. Having an AVX supported CPU could reduce the model inference time. + \item \textbf{GPU with CUDA support and 2GB or more VRAM} - Both Tensorflow and Pytorch depend on CUDA for hardware-accelerated training. 
Training on GPU could save a lot of time increases the number of trial and error iterations that could be done. + \item \textbf{16 GB or more Memory} - Running a microservices simulation locally will consume a lot of memory and while testing models will get loaded into RAM. + \item \textbf{At least 40GB disk space} - To store the dataset, models docker containers while developing the project. +\end{itemize} + +\subsection{Skill Requirements} +\begin{itemize}[noitemsep,nolistsep] + \item \textbf{Experience working with Kubernetes} - The author will be developing a Kubernetes extension so they need to know the inner workings of Kubernetes. + \item \textbf{Data engineering} - Developing a data encoding technique requires a lot of knowledge in how to manipulate a given dataset. + \item \textbf{Model engineering} - Creating model from ground up is difficult task. So the author needs to have an in-depth idea about a machine learning framework and how different layers in the model work in order to fit them properly. +\end{itemize} + +\subsection{Data Requirements} +\begin{itemize}[noitemsep,nolistsep] +\item \textbf{Monitoring dataset} - This dataset can be collected using \hyperref[microsim]{MicroSim} tool author plan to develop to simulate distributed system. 
+\end{itemize} + +\section{Risk Management} + + +\begin{longtable}{|p{4.8cm}|p{1.35cm}|p{1.8cm}|p{7cm}|} + \hline + \textbf{Risk Item} & + \textbf{Severity} & + \textbf{Frequency} & + \textbf{Mitigation Plan} + \\ \hline + + The hypothesis the research is based on is wrong & + 5 & + 1 & + Present the findings and explain why the hypothesis was wrong + \\ \hline + + Failure in work computer & + 4 & + 3 & + Daily backup work the work to a cloud platform + \\ \hline + + Lack of domain knowledge & + 2 & + 3 & + Talk to a domain expert, Do more research + \\ \hline + + Models not generalizing & + 3 & + 4 & + Explore different methods, Try cleaning up the dataset more + \\ \hline + + Dataset quality is not up to the standard & + 4 & + 1 & + Use a method used in related researches to create a new dataset + \\ \hline + + Running out of time & + 1 & + 2 & + Following a thorough work schedule + \\ \hline + + Getting sick and unable to work for few days & + 3 & + 3 & + Keeping few days of a buffer period before deadlines + \\ \hline + \caption{Risks and mitigations} +\end{longtable} \ No newline at end of file diff --git a/proposal/sections/research/sections/methodology/research.tex b/proposal/sections/research/sections/methodology/research.tex new file mode 100644 index 0000000..56cbf4a --- /dev/null +++ b/proposal/sections/research/sections/methodology/research.tex @@ -0,0 +1,31 @@ + +{\let\clearpage\relax \chapter{Research Methodology}} + +% \begin{longtable}{|p{4cm}|p{10cm}|} +% \begin{table} +% \setlength\LTleft{-10mm} +\begin{longtable}{|p{35mm}|p{125mm}|} +\hline + \textbf{Research Philosophy} & + Mainly, there are four research philosophies, Pragmatism, positivism, realism, and interpretivism. It explains the belief and the research is done. 
After doing an in-depth study about research philosophies, the author decided on following \textbf{Pragmatism} as the research philosophy because the author believes there is no one way to solve the problem this research is tried to address and the goal of this research is to solve a practical problem faced by \acp{sres}. (\cite{1Philoso75:online}, \cite{Pragmati87:online}) + \\ \hline + + \textbf{Research Approach} & + Although the inspiration for the research came from an observation of the real world. The author is using \textbf{deductive reasoning} to approach the problem. After the problem was identified the author looked for existing work found few theories on the domain. Then the author found few flaws in these methods thought of a way to address them with different approaches. At the end of the research other hopes to implement these new approaches and observe their outcome. + \\ \hline + + \textbf{Research Strategy} & + The research strategy will be used to answer the research questions. In this project, the author will use \textbf{experimenting, interviews, and surveys} to provide answers to research questions. + \\ \hline + + \textbf{Research Choice} & + During this research project, the author is planning to build a very generalized solution to predict anomalies. So to achieve this, a \textbf{quantitative} dataset will be used to train the model while a \textbf{qualitative} data set will be used for evaluate it. So the data for this research will be collected by using the \textbf{Mixed method}. + \\ \hline + + \textbf{Time zone} & + This project needs to be completed within 8 months, so a \textbf{cross-sectional} time horizon will be used to collect data to complete the project. 
+ \\ \hline + \caption{Research methodology selection} +\end{longtable} +% \setlength\LTleft{10mm} +% \end{table} \ No newline at end of file diff --git a/proposal/sections/research/sections/problem.tex b/proposal/sections/research/sections/problem.tex new file mode 100644 index 0000000..9a4d4e1 --- /dev/null +++ b/proposal/sections/research/sections/problem.tex @@ -0,0 +1,23 @@ + +% {\let\clearpage\relax \chapter{Problem Domain}} + +% With the rise of cloud-native applications, \citep{CloudAdo16:online} a plethora of services to support these applications came to play. Kubernetes itself is also a tool develop to manage containerized microservices and it tries to solve most of the networking and service discovery challenges when it comes to containerized distributed systems. + +% As discussed in the section \ref{monitoring-bg} monitoring is considered one of the biggest challenges in microservices \citep{Understa56:online}. So to address this issue, 3 types of services were introduced. Log aggregators, distributed tracers, and \ac{apm} systems. With that, few companies started creating \ac{saas} products that integrate all 3 of these and show them under a single pane of glass. Some of the key players in this domain are Datadog, New Relic, and Dynatrace to name a few of many. This works well for the most part and a lot of companies no matter the size depend on these services to handle their observability needs. + +% If we take Datadog as an example it offers a wide variety of futures from simple data collection to metric forecasts for predictive monitoring. If developers configured their services with Datadog agents correctly, Datadog will help its users to visualize services performance from a very high level and when a problem occurs it gives all the tools that need to drill into the core and tally all the logs and performance and tracing data understand what's going on \citep{Datadog18:online}. 
They even have a module called watchdog which does aggregate all the \acp{apm} data in the background and tries to find issues in them but is currently only available as a private beta. + +% Even though there are a lot of great products available with all has a few common issues, one of the main issues is it's up to developers to implement metric exports and tracing in their services. As a person who did his placement in a small to medium size startup the author first hand, both managers and developers hesitate to spend time on things like these. Although platforms like Datadog support open initiatives like \href{https://opentelemetry.io/}{opentelemetry} to take the full power of these platforms services have to be architectured towards the observability platform that gonna get used, once committed it's very hard to migrate to another solution. Finally, all these services require users to send over all of their key data including logs to get the most out of it and it could open up a lot of security issues and privacy concerns down the road. + +\newpage + +{\let\clearpage\relax \chapter{Problem Definition}} + +One of the main problems in monitoring microservices is the sheer number of data they generate. It's humanly impossible to monitor the metrics of all the services and it's hard for a single person to understand the entire system. To overcome this \acp{sres} use abstracted metrics called \acp{sli} which measure the quality of the service at a higher level. \acp{sli} will tell when there is an issue in the system, but it's very hard to understand where the actual problem is from it along. To understand the root cause of the problem \acp{sres} need to dig into \acp{apm} of all the services and go through the logs of each of the troubling services. 
+ +When the system consists of 100s or 1000s of services that are interdependent it's really hard to find where the actual issue is coming from and it may require the attention of all the service owners of failing services to go through the logs and \acp{apm} and identify the actual root cause of the failure. +This could greatly increase the \ac{mttr} and waste a lot of developer time just looking at logs. \\ + +\section{Problem Statement} + +Modern distributed systems are becoming so big and complex that when a failure happens it requires collaboration with a lot of people to find the root cause. Implementing a machine learning model which will watch over all the services and react to anomalies in real-time could greatly reduce the \ac{mttr}.\\ \ No newline at end of file diff --git a/proposal/sections/research/sections/project-scope.tex b/proposal/sections/research/sections/project-scope.tex new file mode 100644 index 0000000..cf691dd --- /dev/null +++ b/proposal/sections/research/sections/project-scope.tex @@ -0,0 +1,60 @@ +{\let\clearpage\relax\chapter{Project Scope}} + +From the literature survey and talking with industry experts, the author found many issues they can address when developing the system, but some of those problems like interpretability of autoencoders \citep{ribeiro2016should} are hard to solve by someone at the level of an undergraduate. As this project is done by one developer in less than one year, it won't be possible to create a fully functional monitoring platform like Datadog or New Relic. The focus of this project is to see if the author can develop a single model that can monitor all kinds of services after transfer learning with few examples. \\ + +\newpage + +\section{In-scope} \label{sec:in-scope} +Following are the main focuses of this project +\begin{itemize}[noitemsep,nolistsep] + \item Evaluation Framework + \begin{itemize}[noitemsep,nolistsep] + \item Ability to create a service mesh using Kubernetes native resources.
+ \item Each service has the ability to simulate predefined error types. + \item Service mesh can be made up of services written in different programming languages and frameworks. + \item Built-in method to run stress tests. + \end{itemize} + \item Monitoring System + \begin{itemize}[noitemsep,nolistsep] + \item Low overhead data collection pipeline to collect service telemetry. + \item Reliability system which generates fewer false positives so it won't overwhelm the operators and false negatives will be caught by the main monitoring system. + \item Optimized models to have a fairly small memory footprint and CPU overhead. + \item Well generalized model which will be able to deploy with completely new services and it will learn to adapt to the new system. + \end{itemize} +\end{itemize} + + +% \item Constant changes to services +% \item Highly seasonal and noisy patterns +% \item few shot learning to convert to a new system +% \item Tunability +% \item Reponsed to seasonal dependencies +% \end{enumerate} + + + +\section{Out-scope} \label{sec:out-scope} +The following will not be covered during this project +\begin{itemize}[noitemsep,nolistsep] + \item Evaluation Framework + \begin{itemize}[noitemsep,nolistsep] + \item Support for every major language and framework. + \item Working outside of Kubernetes eco-system. + \end{itemize} + \item Monitoring System + \begin{itemize}[noitemsep,nolistsep] + \item Interpretability - Describing the behavior of an autoencoder is a difficult task that won't be covered during the project. + \item System won't be trained against data from a real production system due to the lack of public datasets. + \item System won't have very high accuracy; as this will be the first line of defense it will try to avoid false positives to prevent adding more noise to alerting systems. + \item Automatically identify system topology.
+ \item This will not be a drop-in replacement for existing monitoring systems, rather this will work with existing monitoring systems to reduce the \ac{mttr}. + \end{itemize} +\end{itemize} + +\section{Prototype Feature Diagram} +\begin{figure}[H] + \centering + \includegraphics[width=16cm]{assets/High-level-system-diagram.png} + \caption{Prototype feature diagram (self composed)} + \label{fig:high-level-diagram} +\end{figure} \ No newline at end of file diff --git a/proposal/sections/research/sections/related-work.tex b/proposal/sections/research/sections/related-work.tex new file mode 100644 index 0000000..51dc1e1 --- /dev/null +++ b/proposal/sections/research/sections/related-work.tex @@ -0,0 +1,203 @@ + + +{\let\clearpage\relax\chapter{Existing Work}} + +\section{Anomaly detection} + +% \setlength\LTleft{-5mm} + +% \begin{longtable}{| p{20mm} | p{47mm} | p{47mm} | p{47mm} |} +\begin{longtable}{| p{20mm} | p{43mm} | p{43mm} | p{43mm} |} +\hline + \textbf{Citation} & + \textbf{Technology summary} & + \textbf{Improvements} & + \textbf{Limitations} \\ \hline + \cite{du2018anomaly} & + Tested most of common machine learning methods to detect anomalies and benchmarked them & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Used SLIs to monitored data + \item A lot of good metrics (input data) + \item Performance monitoring of services and containers + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Only be able to identify predetermined issues + \item Require a sidecar that includes a lot of overhead + \item Won't work with event-driven architectures (this is where most of the new systems are headed) + \item Uses Supervised learning and it's near impossible to find real-world data with labels + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{kumarage2018anomaly} & + The authors here are proposing a semi-supervised technique using a Variational Autoencoder to predict future time 
steps and calculate the difference between predicted and actual to detect anomalies. & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Due to the difficulty of finding labeled research data, they settled on using a semi-supervised technique. + \item Randomized decision trees were utilized to select the most suitable features for each component. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item The model won't be easily transferable to other systems + \item If new key features were added to the system it will require a total retraining + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{kumarage2019generative} & + Uses a bidirectional \ac{gan} to predict future timesteps and uses MSE between prediction and real values to determine the anomalies & + Experimented using a \ac{gan} to detect anomalies rather than using conventional autoencoders & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Accuracy is around 60\% which is not really good to use in production with mission-critical systems. + \item As this is a \ac{gan}-based system, it may take a lot of resources to run with production systems.
+ \end{itemize} \\ \hline + \caption{Comparison of anomaly detection methods in distributed systems} +\end{longtable} + +\section{Root cause identification} + +% \begin{longtable}{| p{20mm} | p{47mm} | p{47mm} | p{47mm} |} +\begin{longtable}{| p{20mm} | p{43mm} | p{43mm} | p{43mm} |} +\hline + \textbf{Citation} & + \textbf{Technology summary} & + \textbf{Improvements} & + \textbf{Limitations} \\ \hline + \cite{gonzalez2017root} & + Detect failures in networks, using machine learning to generate knowledge graphs on historical data & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Build a predictable system + \item Automatic identification of dependencies between system events + \item Doesn't Need to rely on Domain experts + \item Generalized to different systems + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Limited to network issues + \item Even though the knowledge graph helped with visualization of the problem but still, people have to manually figure out what went wrong + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{chigurupati2017root} & + Proposed a way to detect Hardware failures in servers using a probabilistic graphical model which concisely describes the relationship between many random variables and their conditional independence & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Find hidden meaning in values that seems random + \item Used a probabilistic approach to better understand the relationship between inputs and outputs + \item Gives all the possible root cause to a given problem + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Limited to hardware issues + \item Require support from domain experts + \item Can't account for unforeseen error + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{samir2019dla} & + This detects and locates the anomalous behavior of 
microservices based on the observed response time using a \ac{hhmm} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Custom HHMM model + \item Self-healing mechanism + \item Focus on performance detection and identification at the container, node, and microservice level + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Input dataset scale is limited + \item Require a sidecar + \item Needs to predetermined thresholds + \vspace{-7mm} + \end{itemize} \\ \hline + \cite{wu2020microrca} & + Find Performance bottlenecks in distributed systems using an attribute graph to find anomaly propagation across services and machines & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Created a custom Faults Injection module + \item Uses an attribute graph to localize to faulty service + \item Application-agnostic by using a service mesh + \item Rely on service mesh to determine network topology + \item Uses unsupervised learning + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Only able to identify 3 types of issues + \item Looks only for performance anomalies + \item Use the slow response time of a microservice as the definition of an anomaly + \item Service meshes add a lot of overhead to systems + \item Required direct connection between services + \vspace{-7mm} + \end{itemize} \\ \hline + \caption{Comparison of root cause identification methods in distributed systems} +\end{longtable} + +% \newpage +\section{Commercial products} + +% \begin{longtable}{| p{40mm} | p{60mm} | p{60mm} |} +\begin{longtable}{| p{40mm} | p{55mm} | p{55mm} |} +\hline + \textbf{Name} & + \textbf{Futures} & + \textbf{Limitations} \\ \hline + Applied Intelligence by New Relic & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Metric forecasting. + \item Anomaly detection. 
+ \item Alert grouping to reduce noise. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Lack of explainability for certain classifications. + \item All the telemetry data need to be sent to a third party. + \vspace{-7mm} + \end{itemize} \\ \hline + Watchdog by Datadog & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Monitor the metric data of the entire system from the background. + \item Monitor logging data. + \item Highlight relevant components affected by an issue. + \vspace{-7mm} + \end{itemize} & + \vspace{-8mm} + \begin{itemize}[leftmargin=*,noitemsep,nolistsep] + \item Announced in 2018 but is still at private beta. + \item Require code changes and tight integration with datadog platform. + \item Available demos about the system seems to be engineered for demonstration purposes. + \vspace{-7mm} + \end{itemize} \\ \hline + \caption{Comparison of commercial products for root cause analysis} +\end{longtable} +% \setlength\LTleft{5mm} +% As the large-scale migration towards the cloud and microservices started fairly recently the problem this research is trying to solve mostly affects large-scale enterprises there ain't a lot of published research on this domain. All the work done towards uncovering the root cause of failures by large co-operations either kept their finds for internal use to sell it as \ac{saas} product. + +% One of the best implementations found on root cause analysis is from Datadog. They created a platform called watchdog \citep{Watchdog76:online} which monitors the entire system for anomalies and failures in the background. When a failure happens it tries to pull all the relevant stack traces and monitoring data to a single view so the developer can diagnose the problem easily. 
The problem with this solution is even though it was announced all the way back in July 2018, all that is available is currently in private beta which not everyone has access to. +% \\ +% All the currently published work on microservices monitoring can be classified into 2 categories +% \begin{enumerate} +% \item Anomaly detection +% \item Root cause identification +% \end{enumerate} + +% \section{Anomaly detection} + +% Anomaly detection in time series is a field of its own. So in this case we will be forcing papers that are specialized in the cloud computing domain. + +% One of the earliest attempts on detecting anomalies in microservices was \cite{du2018anomaly}. In this authors tried using 4 different machine learning techniques to detect performance anomalies. To do this say used a simulated system and various fault injection mechanisms to create the dataset. In the end, they concluded K Nearest Neighbors classifier gives the most accurate classifications while Support vector machines have the worse. + +% A common way to detect anomalies in time series is using an autoencoder to reconstruct a given time series. After training the model should be able to come up with the generalized function about the given time series and it will be able to recreate any input sequence accurately. But when there is an anomaly in the input sequence models output will be vastly different from the input. We can use this reconstruction loss as a metric to uncover anomalies within the system. In \cite{kumarage2018anomaly} authors used the method to detect anomalies in distributed systems. In a continuation of their work \cite{kumarage2019generative} they tried doing the same thing by using a \ac{gan} but in the end, they concluded even though it showed a tendency towards better performance when the dataset gets bigger, with the dataset they had autoencoders perform well overall. 
+ +% Ever since DeepMind came up with wavenet which used a CNN to generate audio samples \citep{oord2016wavenet} researchers uncovering other potential use cases other than image-related tasks. One of those use cases was as CNN excels at pattern recognition, encoding time series data set into image-like data structures and use a CNN to identify abnormal patterns in it. On \cite{kim2018encoding} authors tried to using a novel technique to raw encode data into a pixel-like structure and found it could outperform the existing methods to detect anomalies in computer networks. + +% \section{Root cause identification} + +% Predicting the exact root cause of failure just using a standard machine learning model is a pretty difficult task since prediction space is not finite. In 2017 a team from Google X tried using the Bayesian Network to model the relationship between the state of the system and its effect on failures \citep{chigurupati2017root}. Using it they were able to accurately predict all the possible root causes of a hardware failure in certain systems but this model required to predefine all the possible error modes by domain experts which isn't really possible in a constantly evolving distributed system. There were similar attempts \cite{gonzalez2017root} to use machine learning to generate knowledge graphs on historical data and help developers come up with reasoning to failures although this eliminated a need for a domain expert, this also can't react to unseen errors. + +% In a distributed system it's hard to spot real anomalies just by looking at monitoring data, but when there are huge spikes in response latencies or error rates it's a good indicator something must be wrong. So \cite{samir2019dla} used a \ac{hhmm} to uncover the possible affected services from changes in response time or error rates in one service and using that data to uncover the root cause of the issue. 
All of the papers discussed above have one problem in common they all assume the entire system is static but in reality, these services changes over time either with increased demand or new future implementations. To address this, \cite{wu2020microrca} developed a service that monitors all the running applications and their vitals. This also constructs an attributed graph that represents how each service interacts with the other. When the monitoring system detects an anomaly MicroRCA weight that graph with response time changes and tries to find the epicenter of the anomaly. The main problem with both of these approaches have is authors rely solely on slow response time as an indication of an anomaly but several other factors could course anomalous behaviors without changes in response times. diff --git a/proposal/sections/research/sections/research/aim.tex b/proposal/sections/research/sections/research/aim.tex new file mode 100644 index 0000000..de7285d --- /dev/null +++ b/proposal/sections/research/sections/research/aim.tex @@ -0,0 +1,8 @@ + +{\let\clearpage\relax\chapter{Research Aim}} + +\textit{The aim of this research is to design, develop and evaluate a toolkit to help system operators to reduce the \ac{mttr} when the system is experiencing an anomaly by using a machine learning model investigating all the services in the system and highlighting the most probable root causes in order, So the operators don't have to find a needle in a haystack.} + +To achieve this author tries to create a single model that can monitor all the vitals of a given service and output an anomaly score in any given time window. The author is hoping to make it generalized enough so operators can take the same model and deploy it with other services and the model will adopt the new services with \ac{fsl} \citep{wang2020generalizing}. To do this author is trying to create a data encoding technique to represent monitoring data in a programming language or framework independent way. 
+ +Finally, the author is hoping to develop a playground that easily simulates a distributed system within a Kubernetes cluster so the create system can be tested and evaluated properly and future researches on this domain will have to benchmark framework to evaluate their work. diff --git a/proposal/sections/research/sections/research/challenge.tex b/proposal/sections/research/sections/research/challenge.tex new file mode 100644 index 0000000..da8be8d --- /dev/null +++ b/proposal/sections/research/sections/research/challenge.tex @@ -0,0 +1,10 @@ + +{\let\clearpage\relax\chapter{Research Challenge}} + +Even though this project seems very straightforward and easy to implement from a high level, but it becomes tricky when attempting to reach targets defined in the section \ref{sec:in-scope}. For example, interpretability was one most requested feature from industry experts and a must-have trait for mission-critical systems \citep{ribeiro2016should}. But it was left out of the project scope due to its complexity especially when it comes to an \textbf{undergraduate project}. Other than that following are a few of the more difficult challenges the author is expected to face while conducting the research.\\ + +\begin{itemize}[leftmargin=*] +\item \textbf{Highly seasonal and noisy patterns} - Monitoring metrics on microservices on production tends to have very unpredictable patterns depending on the traffic that's sent to the service. The amount of traffic sent will depend on several external factors that are hard to determine. Modeling both temporal dependencies and interdependencies between monitoring data into a single graph will be very difficult and require a lot of fine-tuning and data engineering. 
+\item \textbf{Overhead} - Modern deep learning models can solve any problem if we could give it an unlimited amount of data and processing power but In this case, models need to optimize for efficiency over accuracy since having a monitoring system that consumes a lot more resource than the actual target system isn't effective. +\item \textbf{Fit into Kubernetes eco-system} - Kubernetes has become the de-facto standard to managing distributed systems \citep{WhatisCo78:online}. So the author is planning to create a Kubernetes extension that will bridge the connection between monitored service and monitoring model as shown in the figure \ref{fig:high-level-diagram}. But Kubernetes itself has a very steep learning curve, even the original developers themselves admitted it's too hard complex for beginners \cite{Googlead4:online}. +\end{itemize} diff --git a/proposal/sections/research/sections/research/contribution.tex b/proposal/sections/research/sections/research/contribution.tex new file mode 100644 index 0000000..b1fe333 --- /dev/null +++ b/proposal/sections/research/sections/research/contribution.tex @@ -0,0 +1,11 @@ + +{\let\clearpage\relax\chapter{Research Contribution}} + + +\section{Domain Contribution} + +With this research, the author first tries to develop a \textbf{cloud-native solution to create a configurable microservices system}, So this research and future researches will have a standard environment to develop and evaluate their work. The author also hopes to build a lightweight and \textbf{low-overhead data collection pipeline} using \ac{ebpf} to collect telemetry of target services without any instrumentation from the user. + +\section{Knowledge Contribution} + +One of the main problems with monitoring microservices systems is different services can be developed with different programming languages and frameworks and those can contain different levels of noisiness\label{need-for-encoding}. 
So it's hard for a single model to detect anomalies in any service since some frameworks tend to use more resources while idle than others. So to address this author is trying to come up with an \textbf{encoding method} so the model can be trained to monitor one framework and those learning will still be valid for another framework. With those encoded data the author is hoping to develop a \textbf{convolutional autoencoder that will use unsupervised learning to spot out anomalies in a given data stream}. This may have better performance while using fewer resources convolutional layers are typically lightweight and good at pattern recognition \citep{oord2016wavenet}. Finally, the author is planning to aggregate those predictions from the models into a pre-generated service graph and weigh it to \textbf{find all possible root causes}. diff --git a/proposal/sections/research/sections/research/gap.tex b/proposal/sections/research/sections/research/gap.tex new file mode 100644 index 0000000..361fcad --- /dev/null +++ b/proposal/sections/research/sections/research/gap.tex @@ -0,0 +1,8 @@ + +{\let\clearpage\relax\chapter{Research Gap}} + +After a literature survey author came conclusion finding a root cause of any failure within a distributed system is a very difficult issue due to it not having single output we can try to predict and most researchers have built their own simulation of a distributed system by themselves since there isn't any open dataset about monitoring data mainly because it could contain sensitive information. + +Most currently established researches are done towards creating statistical models like clustering and linear regression. Even though these algorithms perform very well in small-scale systems, they struggle to keep up when the monitoring data become very noisy with scale. Another problem none of these papers properly addressed was constant changes to services. 
Almost all published research considers target services as static but in reality, these services can change even many times per day \citep{GoingtoM51:online}. + +After talking with industry experts the author concluded there are three main issues with using a machine learning model as a monitoring agent: Reliability, Interpretability, and Tunability. On reliability, experts said too many false positives will make operators lose faith in the system because it's going to be another distraction to them. As the operators have to take critical decisions with the output of these models, it has to be interpretable by humans \citep{ribeiro2016should}. Finally, this system should act more like a tool rather than a replacement for human operators, because machine learning models cannot compete with the context a human can handle. diff --git a/proposal/sections/research/sections/research/motivation.tex b/proposal/sections/research/sections/research/motivation.tex new file mode 100644 index 0000000..0811091 --- /dev/null +++ b/proposal/sections/research/sections/research/motivation.tex @@ -0,0 +1,5 @@ + +{\let\clearpage\relax\chapter{Research Motivation}} + +Modern distributed systems generate tons of useful and not so useful telemetry data. As the system grows in demand and size, these telemetry data only get noisier and more complex \citep{Untangli35:online}. It's difficult for humans to make sense of all these data, especially if they don't have many years of experience with the system. On the other hand, deep learning models thrive when they have a lot of data to learn from. As these models can be trained in computer-simulated environments they can learn concepts humans take years to grasp within days \citep{OpenAI_dota, silver2017mastering}.
Finally, unlike humans a deep learning model can monitor a service 24x7 without taking any breaks which will not only prevent outages even before they happen, It could be reduced \ac{mttr} because the issue can be detected way earlier than any human could do. + diff --git a/proposal/sections/research/sections/research/objective.tex b/proposal/sections/research/sections/research/objective.tex new file mode 100644 index 0000000..b5ec62c --- /dev/null +++ b/proposal/sections/research/sections/research/objective.tex @@ -0,0 +1,135 @@ + +{\let\clearpage\relax\chapter{Research Objectives}} + +\newcommand\robProblemIdentification{ +When selecting the problem author wanted to pursue, they had 3 main goals. +\begin{enumerate}[leftmargin=*,noitemsep,nolistsep] +\item The problem domain should be something they enjoy working in. +\item At the end of the research should have done a meaningful impact on the target domain, both in the theoretical and practical aspect, +\item It should be challenging to achieve and results should speak about themselves. +\vspace{-7mm} +\end{enumerate} +% After many iterations of trial and error the author settled on "Cloud Computing" as the domain, "Root cause analysis" as the problem because the author is a site reliability engineer by profession and quickly able to identifying the root cause of a failure could lower \ac{mttr}. +} + +\newcommand\robLiteratureReview{ +% After a general topic was identified, the author needed to do evaluate all the currently published work to understand what’s the current state of the problem and how other researchers and developers are approaching this problem. After an intensive literature survey author was able to identify a new angle to approach the domain. + +% During this period author contacted few experts in the cloud computing domain and evaluate the idea and plan for the project. 
+Conduct a Literature review on root cause analysis to, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item To find the current methods used to anomaly detection and localization. +\item Uncover issues with current approaches. +\item Understand how advancement in other related domains can apply to this domain. +\vspace{-7mm} +\end{itemize} +} + + +\newcommand\robDevelopingEvaluation{ +During the literature survey, one problem the author identified was there isn’t a uniform dataset when it comes to training and evaluating models to detect anomalies in microservices. Most of the researchers used private datasets to train and test their work. +To address this author is developing, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item A tool that can easily simulate a distributed system in a cloud-native setting. +\item A tool inject anomalies into the running services. +\vspace{-7mm} +\end{itemize} +} + +\newcommand\robPublishPlayground{ +The author is hoping to publish a paper about the above-mentioned tool so the future researchers will have a unified way to train, test, and benchmark their system without having to reinvent the wheel again and again. +} + +\newcommand\robDataGathering{ +% The author plans to use the above-mentioned tool to simulate a large-scale distributed system made up of services done in different frameworks and subject it to a load test. Then collect the monitoring data from that to train the model. +In order to create model to detect anomalies the author will, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item Simulate distributed system. +\item Simulate traffic inside the system +\item Collect monitoring data while it's running +\vspace{-7mm} +\end{itemize} +} + +\newcommand\robDevelopingEncoding{ +As mentioned in the section \ref{need-for-encoding} these services will report very different values even at idle. 
To normalize data from all the services to one format the author will, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item Evaluate current data encoding methods like \cite{zhang2019deep}. +\item Find the best fit and optimize it for this use case. +\item Test if there is any improvement by using this method. +\vspace{-7mm} +\end{itemize} + +% So there needs to be a way to normalize data from all the services to one format so the model can generalize for all the services no matter the framework it was built on. Inspired by \cite{zhang2019deep} the author is trying to develop or adopt an encoding technique to present data in an image-like structure so both ML models and humans can spot out anomalies easily. +} + + +\newcommand\robDevelopingModel{ +% Autoencoders have been outperforming all other types of models \citep{kumarage2019generative} when it comes to anomaly detection. Since this project already has a module that converts raw data to an image-like structure the author is hoping to use a convolution autoencoder which will be lighter and has the potential to outperform normal autoencoders when paired with the above data encoding technique. +According to \cite{kumarage2019generative} Autoencoders tend to perform best when it comes to anomaly detection. But during the literature survey it was revealed Convolutional Autoencoders weren't tested. So the author tries to develop a Convolutional Autoencoder and test how it will perform. +} + + +\newcommand\robTesting{ +The following things will be tested during the testing phase, +\begin{itemize}[leftmargin=*,noitemsep,nolistsep] +\item How will the system classify long-term fluctuations. +\item How will the system classify short-term fluctuations. +\item Can the system understand the mapping between core metrics like CPU and Memory usages. +\item Accuracy of fault detection. +\item Accuracy of root cause localization.
+\vspace{-7mm} +\end{itemize} +} +% The author hopes to carry an extensive evaluation on the system with a wide variety of edge cases and the author is hoping to see how the model identifies both short-term and long-term fluctuations and whether it can properly find a mapping between core vitals like CPU and Memory usages. + + +\newcommand\robIntegration{ +Having a fancy model doesn’t add means anything if it’s very hard to use in a real system. So the author is hoping to develop a Kubernetes extension that will map the model with any service given by the user. +} + + +% \begin{table}[] +% \setlength\LTleft{-5mm} +\begin{longtable}{|p{38mm}|p{95mm}|p{17mm}|} +% \begin{longtable}{|p{40mm}|p{100mm}|p{20mm}|} +\hline +\textbf{Research Objectives} & \textbf{Explanation} & \textbf{Learning Outcome} \\ \hline +Problem identification & \robProblemIdentification & LO1 \\ \hline +Literature review & \robLiteratureReview & LO3, LO4, LO6 \\ \hline +Developing an evaluation framework & \robDevelopingEvaluation & LO7 \\ \hline +Publish a paper about that playground & \robPublishPlayground & LO7 \\ \hline +Data gathering and analysis & \robDataGathering & LO7 \\ \hline +Developing encoding method & \robDevelopingEncoding & LO2, LO5, LO7 \\ \hline +Developing the model & \robDevelopingModel & LO2, LO5, LO7 \\ \hline +Testing and evaluation & \robTesting & LO8, LO9 \\ \hline +Integration & \robIntegration & LO7 \\ \hline +\caption{Research objectives} +\end{longtable} +% \setlength\LTleft{0mm} +% \end{table} +% \subsection{Project Objectives} +% \begin{itemize} +% \item Find area I am interested in +% \item Find a issue in it +% \item Evaluate the issue with experts in the field +% \item Create PID +% \item Create playground to test the final product in +% \item Publish a paper about that playground +% \item Finalize on requirements +% \item Develop prototype and Document the progress +% \item Create operator to plugin any ML model to monitor +% \item Publish paper with results +% 
\item complete thesis +% \item conclude the project +% \item open source the code +% \end{itemize} + +% \subsection{Research Objectives} +% \begin{itemize} +% \item Literature Survey +% \item Requirement Analysis +% \item Design +% \item Development +% \item Testing +% \end{itemize} diff --git a/proposal/sections/research/sections/research/question.tex b/proposal/sections/research/sections/research/question.tex new file mode 100644 index 0000000..2a85bdb --- /dev/null +++ b/proposal/sections/research/sections/research/question.tex @@ -0,0 +1,17 @@ + +{\let\clearpage\relax\chapter{Research Question}} + + +\begin{enumerate}[leftmargin=*,label=\textbf{RQ\arabic*:}] + +\item How can a machine learning model improve \ac{mttr} in a distributed system? + +\item What is the most efficient way to present raw data monitoring to machine learning model? + +\item What will be the most ideal machine learning model to uncover anomalies in a microservice? + +\item What are the methods that can be used to evaluate a root cause prediction system? + +\end{enumerate} + +