BlogParser.html

<!DOCTYPE html>
<html>
<!--
  BlogParser.html
-->
<head>
  <title>Blog Parser</title>
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <!-- <link rel="icon" type="image/x-icon" href="./images/favicon.ico" /> -->
  <link rel="stylesheet" href="css/StylesPhoto.css" />
  <link rel="stylesheet" href="css/StylesSizerComp.css" />
  <!-- PageFrame infrastructure -->
  <link rel="stylesheet" href="css/StylesPageFrameDefaults.css" />
  <link rel="stylesheet" href="css/StylesPageFrameStructure.css" />
  <link rel="stylesheet" href="css/StylesPageFrameMenus.css" />
  <link rel="stylesheet" href="css/StylesPageFrameThemePython.css" />
  <link rel="stylesheet" href="css/StylesWebComponents.css" />
  <script src="js/ScriptsWebComponents.js"></script>
  <!--<script src="js/ScriptsPageFrameDefaults.js"></script>-->
  <script src="js/ScriptsPageFramePosts.js"></script>
  <script src="js/ScriptsPageFramePagesPosts.js"></script>
  <script src="js/ScriptsPageFrameKeyboard.js"></script>
  <script src="js/ScriptsSizerComp.js"></script>
  <!-- No need for Pages script for pages with no next or prev pages -->
  <!--<script src="js/ScriptsPageFramePages.js"></script>-->
  <!-- <script src="js/ScriptsTemplate.js"></script>
  <link rel="stylesheet" href="css/StylesTemplate.css" /> -->
  <style>
    h3 {
      margin-top: 1.5em;
    }

    #subtitle {
      margin-top: 0.4em;
      margin-bottom: 0.3em;
    }

    #github header summary {
      border: 1px solid var(--light);
    }

    #github summary {
      padding-right: 2em;
    }
    /* #github .menuHead {
          margin:0em -0.25em 0.0em -0.25em;
          padding:0.25em 0.5em;
        } */
  </style>
  <script>
    function load() {
      initialize();
      //loadif();
    }
  </script>
  <style>
    #github note {
      display: block;
      width:max-content;
      border:1px solid red;
      padding:0.5em 1.0em;
      margin:0.5em 0em;
    }
    #github .bargraph {
      border: 1px solid var(--dark);
      /* background-color: #bbb; */
      padding: 0.1em 0.5em;
      font-size:0.9em;
    }
    #github table {
      border:2px solid var(--dark);
    }
    #github table td {
      padding:0.25em 1.0em;
      border:none;
    }
    body {
      user-select:none;
    }
  </style>
  <script>
    function clickstat() {
      // prevent parent click event handling
      event.stopImmediatePropagation();
    }
  </script>
</head>
<body id="github" onload="load()" style="position:relative;">

  <a id="Next" href="BlogCodeAnalyzer.html">Next</a>
  <a id="Prev" href="BlogNoSql.html">Prev</a>

  <page-frame>
    <frame-header>
      <nav id="navbar"></nav>
    </frame-header>
    <main id="main">
      <div id="about" onclick="this.style.display = 'none'">about</div>
      <div id="page">Blog: Parser</div>
      <div id="modified">11/27/2024</div>
      <div id="hlp"></div>
      <a id="top"></a>
      <content style="height:100vh; position:relative;">
        <header style="cursor:pointer;" onclick="loadif()">
          <!-- <a target="_blank" class="repoLink" href="https://github.com/JimFawcett">github Repositories</a> -->
          <hgroup id="pagetitle" style="border: 2px solid var(--dark);">
            <h1 id="title">Blog: Parser</h1>
            <h3 class="indent" id="subtitle">
              toker, semiExp, parser, rules, actions
            </h3>
          </hgroup>
          <!-- <img style="width:100%; margin:-0.1em 0em; border:2px solid var(--dark); padding:0.5em; background-color:var(--light);" src="Pictures/officestrip3a.svg" /> -->
          <div class="darkItem" onclick="loadif()" style="cursor:pointer; position:relative; padding:0.0em 0em 0.25em 0em; margin-top:-0.50em; border:2px solid var(--dark);">
            <a class="repoLinks" target="_blank" href="https://github.com/JimFawcett" style="color:var(--atten); margin-left:1.5em;">About</a>
            <div style="font-size:0.9em; position:absolute; top:0.1em; right:1.5em;">click to toggle Site Explorer</div>
            <div style="height:0.5em;"></div>
          </div>
        </header>

        <h3>Initial Thoughts:</h3>
        <t-b>
          Parsing is the process of discovering and classifying the parts of some complex thing.  Our interests are in parsing computer languages and particularly C, C++, Java, and C#.
          In this context parsing is the process of some form of syntactic analysis, which may be based on a formal reduction using some representation like <a target="_blank" href="http://en.wikipedia.org/wiki/BNF_grammar">BNF</a>,
          or using some Ad-Hoc process.
        </t-b>
        
        <t-b>
          There are a lot of reasons you may wish to parse source code beyond compiling it's text.  For example:
          <ul class="tights">
            <li>
              Building code analysis tools
            </li>
            <li>
              Searching for content in or ownership of code files
            </li>
            <li>
              Evaluating code metrics
            </li>
            <li>
              Compiling &quot;little embedded languages&quot;
            </li>
          </ul>
          Many code parsers have been written including: <a target="_blank" href="http://en.wikipedia.org/wiki/ANTLR">ANTLR</a>, <a target="_blank" href="http://en.wikipedia.org/wiki/GNU_bison">bison</a>,
          <a target="_blank" href="http://en.wikipedia.org/wiki/Lex_(software)">Lex</a>, and <a target="_blank" href="http://en.wikipedia.org/wiki/Spirit_Parser_Framework">Spirit</a>.  There is a long history of
          successful use of some of these, so why would we consider writing yet another parser?
        </t-b>
        <t-b>
          Using existing parsers for the fairly small tasks in which we are interested seems like killing flies with a sledge hammer - to much work and not enough reward.
          Our goals are to build a facility that is quick to deploy, can be easily ported to different platforms, and for which the parsing model can be built incrementally
          as we learn more about the work we are trying to accomplish.
        </t-b>
      
        <indent-block>
          <div style="width:calc(100vw - 9rem);"><div id="fig1"></div></div>
        </indent-block>
      
        <h3 id="parser">A Rule-based Parser built in C++</h3>
        <t-b>
          Several years ago I built a prototype to illustrate some design ideas 
          for one of my graduate classes and to serve as help for a code analysis project that I
          assigned to them.  
        </t-b>
        <t-b>
          The parser had to be simple enough and easy enough 
          to use for students to understand it and incorporate successfully into their projects 
          in a week or two and still get the project assignment turned in on time.
        </t-b>
        <t-b>
          That prototype has since been used in a couple of Doctoral research projects and by many 
          of my graduate classes on a variety of projects.  We've found it to be an effective facility 
          for learning language structure in the classroom and building research tools in the lab.
        </t-b>
      
        <h3 id="design">Design:</h3>
        <t-b>
          The Parser's logical structure is shown in the Parsing Facility diagram, here.  It has a structure provided by the four blocks shown.  At the bottom is a
          scanner with the responsibility to consume a string or text file and return a stream of token collections.
        </t-b>
        <t-b>
          Each token is a text word or group of punctuators.
          Some tokens are constrained to consist of a single instance of specialized characters like braces, brackets, and such.  Often the transition between characters
          and white space are taken as token boundaries, but many special cases have been incorporated over years of use.
        </t-b>
        <t-b>
          The tokenizer collects quoted strings and comments as
          single tokens, independent of their contents.  You can ask the tokenizer to return or to throw away comment tokens.  Aside from comments and white space, the tokenizer
          promises to return all the input stream's characters, in the order provided by the source.  What it does is to segregate them into words called tokens. Thus it
          removes all textual structure of the source while preserving it's compilable information.
        </t-b>
        <t-b>
          For code analysis we collect tokens into grammatical sequences, called SemiExpressions, by terminating the collection on semicolons, curly braces, or newlines
          if the line starts with &quot;#&quot;.  Each of these collections is usually sufficient to detect a single grammatical construct without including tokens from
          the next grammatical entity.  If we are parsing XML text then we use a package called XmlParts which has similar responsibilities to the SemiExpression package.
        </t-b>
        <t-b>
          The Parser package uses the scanner to collect token sequences for analysis.  It is essentially a container of rule detectors that implement an IRule interface.
          Parser simply feeds the current token sequence to each rule in an internal rule container.  When that is complete it requests another token sequence and repeats
          until there is nothing more to collect.
        </t-b>
        <t-b>
          The Parser doesn't need to know anything about how its token sequences are collected nor how a rule will handle the sequence.  It is simply a traffic cop that
          supplies the rules with what they need.  Each rule has a collection of actions.  When the rule is satisfied by a token collection it invokes its actions.  It
          is up to the action to decide what to do with the information contained in the token sequence when its rule fires.  Each action is handed a reference to a
          data repository to store and retrieve information to carry out its task.  Note that the rules don't need to know what the actions do and the actions don't
          even know the rules exist.  They just do their thing when asked.
        </t-b>
        <t-b>
          For each parsing application, the only things that need to change are the rules and actions, the ConfigureParser builder that assembles the analyzer's parts,
          the Repository and Display.  All the complex parts - tokenizer, SemiExpression, and Parser don't need to change at all. The rules and actions will probably be
          modeled after already existing rules and actions, so reuse in a new application is fairly straight forward.
        </t-b>
        <div class="clear"></div>
        <div style="width:calc(100vw - 9rem)"><div id="fig2"></div></div>
      
        <h3 id="output">Typical Output:</h3>
        <t-b>
          We show, in the command window, the results of running a simple rule set in the parser.  This set has just a few rules that detect when a function is defined,
          record the starting line number, and keep track of the current scope, using the ScopeStack shown in the class diagram near the Repository.
          When the code sequence leaves the function's scope that line number is also recorded, so we can tell how many text lines are used by the function's
          definition.
        </t-b>
        <t-b>
          Our research group has used the Parser as the analysis engine for dependency analysis, code restructuring, and some code visualization.  This Parser
          design has allowed us to focus on asking and answering questions about code rather than the details of its syntactical analysis.
        </t-b>
      
        <h3 id="code">Source Code:</h3>
        <t-b>
          This parser implementation is written in C++ using Visual Studio 2012.  It should port, with little difficulty, to Gnu gcc to run on Linux, for example.
          You will find several projects in the VS solution.  You may wish to start with the Parser project which runs out of a test stub main in the parser package.
          <div class="indent" style="margin:10px;"><a target="_blank" href="https://github.com/JimFawcett/CppParser">Parser Code</a></div>
          This code bears a copyright &copy; that grants all rights to users except the right to publish and requires retention of the copyright notice.
        </t-b>
      
        <h3 id="contrib">Contributions</h3>
        <t-b>
          One of my doctoral advisees, Dr. Murat Gungor (completed 2006), built the first rule set that analyzes the complete C++ scope tree.  Murat used that
          to investigate the structure of a large software system, looking for structural defects.  This <a target="_blank" href="..\research\PapersWithMurat\DepAnal4SERP05_Ver10-SER3092.pdf">work</a>
          was published at SERP05.  A current doctoral
          advisee, Mehmet Kaya, has built a rule base, also for the C++ scope tree, that is used with the most recent version of the parser.  He has been using
          that to develop new ways of restructuring C++ source code to improve its structure and maintainability.  Here is a Syracuse University
          <a target="_blank" href="http://surface.syr.edu/eecs_techreports/73/">Technical Report</a> describing that work.
        </t-b>
      
        <h3>Conclusions:</h3>
        <t-b>
          Of hundreds of design and implementation projects I've done over many years, this Parser is the one of which I am most pleased.
          It does a fairly complex job with simple parts.  It uses interfaces: ITokenCollection, IRule, IAction, and IBuilder to
          decouple all the important parts so the design is very robust under change. It serves as a nice illustration of polymorphism and the four object-oriented class
          relationships: inheritance, composition, aggregation, and using.  And, it has been used successfully in several research activities and many classroom projects.
        </t-b>
        <div style="height:1.0em;"></div>
        <div style="margin-bottom:10px;">
          <img class="photo" src="Pictures/facultyCenterStrip.jpg" alt="Newhouse" style="width:100%;" />
        </div>
        <div style="height:1.0em;"></div>
        <a id="bottom"></a>
      </content>
      <page-TOC id="pages" style="display:none;">
      </page-TOC>
      <page-sections id="sections" style="display:none;">
        <menu-elem style="width:0.0em">&nbsp;</menu-elem>
        <menu-elem class="secElem"><a href="#bottom">bottom</a></menu-elem>
        <menu-elem class="secElem"><a href="#contrib">contrib</a></menu-elem>
        <menu-elem class="secElem"><a href="#code">code</a></menu-elem>
        <menu-elem class="secElem"><a href="#output">output</a></menu-elem>
        <menu-elem class="secElem"><a href="#design">design</a></menu-elem>
        <menu-elem class="secElem"><a href="#parser">parser</a></menu-elem>
        <menu-elem class="secElem"><a href="#top">top</a></menu-elem>
        <div class='darkItem popupHeader' style="padding:0.25em 2.0em;" onclick="this.parentElement.style.display='none'">Sections</div>
      </page-sections>
    </main>
    <frame-footer>
      <menu-item style="width:2.0em;">&nbsp;</menu-item>
      <menu-elem id="nextLink2" onclick="bottomMenu.next()">Next</menu-elem>
      <menu-elem id="prevLink2" onclick="bottomMenu.prev()">Prev</menu-elem>
      <menu-elem id="pgbtn" onclick="bottomMenu.pages()">Pages</menu-elem>
      <menu-elem onclick="bottomMenu.sections()">Sections</menu-elem>
      <menu-elem onclick="bottomMenu.about()">About</menu-elem>
      <menu-elem id="kysbtn" onclick="storyHlpMenu.keys()">Keys</menu-elem>
      <menu-elem style="margin-right:1em">
        <span id="loc" style="display:inline-block; font-weight:normal"></span>
      </menu-elem>
    </frame-footer>
  </page-frame>
  <script>
    createSizer("Pictures/ParserStaticStructure.png", "Fig 1. Parser Static Structure", 600, "fig1");
    createSizer("Pictures/ParserOut.png", "Fig 2. Parser Demo Output", 400, "fig2");
  </script>
  <script>
    let loc = document.getElementById("loc");
    let fn = window.location.href.split(/\/|\\/).pop();
    loc.innerHTML = fn + ":";
  </script>
</body>
</html>