presentation.html

<html lang="en">

	<head>
		<meta charset="utf-8">

		<title>A Free/Open-Source Morphological Transducer for Western Armenian</title>

		<meta name="description" content="A Free/Open-Source Morphological Transducer for Western Armenian">
		<meta name="author" content="Hovsep Dolatian and Daniel Swanson and Jonathan N. Washington">

		<meta name="viewport" content="width=device-width, initial-scale=1.0">
		<!-- , maximum-scale=1.0, user-scalable=no" -->

		<link rel="stylesheet" href="./reveal.js/dist/reset.css">
		<link rel="stylesheet" href="./reveal.js/dist/reveal.css">
		<link rel="stylesheet" href="./reveal.js/dist/theme/beige.css" id="theme">

		<!-- Code syntax highlighting -->
		<link rel="stylesheet" href="./reveal.js/plugin/highlight/monokai.css" id="highlight-theme">
		<!-- link rel="stylesheet" href="./reveal.js/lib/css/zenburn.css" -->

		<!-- Printing and PDF exports -->
		<script>
			var link = document.createElement( 'link' );
			link.rel = 'stylesheet';
			link.type = 'text/css';
			link.href = window.location.search.match( /print-pdf/gi ) ? './reveal.js/css/print/pdf.scss' : './reveal.js/css/print/paper.scss';
			document.getElementsByTagName( 'head' )[0].appendChild( link );
		</script>

		<!--[if lt IE 9]>
		<script src="lib/js/html5shiv.js"></script>
		<![endif]-->

		<link rel="stylesheet" href="jnw-overrides.css">
	</head>

	<body>

		<div class="reveal">
			<div class="slides">

				<section id="title" data-state="no-toc-progress">
					<style>.header1 header:after { content: "Header 1 Example"; }</style>
					<h1><div style="margin-bottom: 0.5em !important ;">A Free/Open-Source Morphological Transducer for Western Armenian</div></h1>
					<div style="padding-top:0in; padding-bottom: 0.25in;">
						<!-- h3 -->
						<div style="width: 100%; padding-bottom: 0.25in;">
							<div style="width: 32%; float: left;">
								<img style="width: 5.5em; height: 5.5em;" src="img/pic_HD.jpg"/><br />
								Hossep Dolatian¹
							</div>
						<!-- /h3-->
							<div style="width: 32%; float: left;">
								<img style="width: 5.5em; height: 5.5em;" src="img/pic_DS.jpg"/><br />
								Daniel Swanson²
							</div>
							<div style="width: 32%; float: left;">
								<img style="width: 5.5em; height: 5.5em;" src="img/pic_JNW.jpg"/><br />
								Jonathan&nbsp;N.&nbsp;Washington³
							</div>
							<div style="clear: both"></div>
						</div>
						<ol class="shrink">
							<li>Stony Brook University, USA</li>
							<li>Indiana University, USA</li>
							<li>Swarthmore College, USA</li>
						</ol>
					</div>
					<div style="background-color: white;">
						<img class="noframe" src="img/sb_logo.jpg" style="height: 5em;"/>
						<img class="noframe" src="img/iu_trident_web_crimson.svg" style="height: 5em;"/>
						<img class="noframe" src="img/swarthmorelogowhite.svg" style="height: 5em;"/>
						<img class="noframe" src="img/Apertium_logo_lightbg.svg" style="height: 5em;"/>
					</div>
				</section>


				<section>
					<h2>Overview</h2>
					<div class="float-left">
					<div class="box">
						<p class="bbb">Our talk in a nutshell</p>
						<p>We developed a prototype <span class="highlight">free/open source</span> morphological transducer for <span class="highlight"/>Western Armenian</span></p>
						<!-- ul -->
							<!-- li>Turkic non-finite verb forms often argued to be multifunctional</li -->
							<!-- li>Commonly assumed multifunctionality of Turkic non-finite verb forms does not hold</li>
							<ul>
								<li>Conflates important distinctions</li>
								<li>Too few categories</li>
							</ul>
							<li>Proposal for novel delineation</li>
						</ul -->
					</div>
					</div>
					<div class="float-right"><!--  style="margin-top: 1em;" --> <!-- #padding-top: 10pt     | to make the top vertically align with the other box -->
					<div class="box">
					<p class="bbb">The next ~12 minutes:</p>
					<ul>
						<li>Background</li>
						<ul>
							<li>Western Armenian</li>
							<li>Morphological tranducers</li>
						</ul>
						<li>Implementation and Challenges</li>
						<li>Evaluation</li>
						<li>Future work</li>
						<li>Conclusion</li>
					</ul>
					</div>
					</div>
				</section>

				<section>
					<h2>Background</h2>
					<h3>Western Armenian</h3>
					<!-- <img src="img/Sakha.png" style="height: 10em;"><br /> -->
					<ul>
						<li>IE language</li>
						<li>~1.6 million  speakers</li>
						<li>Spoken primarily in diaspora</li>
						<li>Status varies from viable to endangered, based on location</li>
            <li>Morphological typology
            <ul> 
            <li> Primarily suffixing and agglutinative
            <li> Multiple conjugation classes and declension classes
            <li> Has root suppletion and stems
            </ul> </li>
					</ul>
				</section>


				<!-- 
<section data-background="img/map.svg" data-background-position="left center" data-background-size="contain">
					<div style="background-color: rgba(0.3, 0.3, 0.3, 0.4); transparency 50%; position: absolute; top: 0px; right: -7em; margin: 0 0 0 0; padding: 0.5em 0.5em 0.5em 0; color: black">
						<p>Might not need the map</p>
						<ul style="list-style-type: none;   text-shadow: 2px 2px 8px #FFFFFF;">
							<li>List:</li>
							<ul>
								<li><span class="code">hye</span> - Eastern Armenian</li>
								<li><span class="code"><b class="highlight-cyan" style="text-shadow: 2px 2px 8px #000000;">hyw</b></span> - <b class="highlight-cyan" style="text-shadow: 2px 2px 8px #000000;">Western Armenian</b></li>
							</ul>
						</ul>
					</div>
				</section>
 -->
				<section data-auto-animate>
					<span data-id="backgroundFSTs"><h2>Background</h2>
					<h3>Morphological tranducers</h3></span>
					<ul>
						<li>Twofold function: morphological analysis and generation<br />
						<div class="center">
							<span class="code" style="line-height: 1.5 !important;">սիրեցին <span class="highlight">↔</span> սիրել<tag>v</tag><tag>tv</tag><tag>past</tag><tag>pret</tag><tag>p3</tag><tag>pl</tag><tag>indc</tag></span><br />
							<span class="center small ipa" style="line-height: 1.5 !important;">[siɾeʦin] ‘<em>they liked</em>’ <span class="highlight">↔</span> [siɾel] ‘<em>to like</em>’, verb, intransitive, past, preterite, 3rd person, plural, indicative</span>
						</div>
						</li>
						<li>Implemented as finite state transducers (FST)</li>
						<li>Compiled from <span class="highlight">hand-coded</span> lexical, morphotactic, and morphophonological generalisations</li>
						<li>Only one development cycle</li>
						
					</ul>
				</section>
				<section data-auto-animate>
					<span data-id="backgroundFSTs"><h2>Background</h2>
					<h3>Morphological tranducers</h3></span>
					<ul>
						<li><span class="highlight">many uses</span> in language technology and “downstream” tasks:</li>
						<ul>
							<li>can be repurposed as spell checkers</li>
							<li>may be used in rule-based machine translation pipelines</li>
						</ul>
						<li>Some <span class="highlight">current uses</span> of this transducer:</li>
						<ul>
							<li>Verb paradigm generator <span class="small">(<a href="https://apertium.github.io/apertium-paradigmatrix/">apertium.github.io/apertium-paradigmatrix</a>)</span></li>
							<li>Virtually-complete morphology helps find synchronic variation, and </li>
							<li>Useful when writing a reference grammar</li>
							<li>Looking into pedagogical uses</li>
						</ul>
					</ul>
				</section>

				<section> 
					<h2> Number of lemmas</h2>
					<div class="float-left">
					<table class="shrink nolines">
						<tr><th></th><th>category</th><th>entries</th><th>tag</th></tr>
						<tr class="addline-top"><th rowspan="4" class="rot90"><div>Core POS</div></th><td>Noun</td><td class="raln">39 006</td><td><tag>n</tag></td></tr>
						<tr><td>Adjective</td><td class="raln">18 617</td><td><tag>adj</tag></td></tr>
						<tr><td>Verb</td><td class="raln">7 441</td><td><tag>v</tag></td></tr>
						<tr><td>Adverb</td><td class="raln">1 895</td><td><tag>adv</tag></td></tr>
						<tr class="addline-top"><th rowspan="4" class="rot90"><div>Names</div></th><td>Given name</td><td class="raln">4 848</td><td><tag>np</tag><tag>ant</tag></td></tr>
						<tr><td>Surname</td><td class="raln">2 052</td><td><tag>np</tag><tag>cog</tag></td></tr>
						<tr><td>Location&nbsp;name</td><td class="raln">1 183</td><td><tag>np</tag><tag>top</tag></td></tr>
						<tr><td>Other name</td><td class="raln">22</td><td><tag>np</tag><tag>al</tag></td></tr>
						<tr class="addline-top"><th rowspan="7" class="rot90"><div>Function, other</div></th><td>Pronoun</td><td class="raln">415</td><td><tag>prn</tag></td></tr>
						<tr><td>Adposition</td><td class="raln">130</td><td><tag>pr</tag>,&nbsp;<tag>post</tag></td></tr>
						<tr><td>Abbreviation</td><td class="raln">81</td><td><tag>abbr</tag></td></tr>
						<tr><td>Interjection</td><td class="raln">49</td><td><tag>ij</tag></td></tr>
						<tr><td>Conjunction</td><td class="raln">48</td><td><tag>conj</tag></td></tr>
						<tr><td>Numeral</td><td class="raln">41</td><td><tag>num</tag></td></tr>
						<tr><td>Particle</td><td class="raln">9</td><td><tag>particle</tag></td></tr>
						<tr class="addline-top addline-bottom"><td></td><th class="laln">Total</th><td class="raln">75 837</td><td></td></tr>
						
					</table>
					</div>
					<!-- div style="float: center;">
								<img   src="img/lexicon.jpg"/><br />
					</div -->
					<div class="float-right">Recognizes/generates<br/><span class="highlight">millions</span> of<br/>inflected forms </div>
				</section> 
     <!-- 
   <section>
            <h2>Existing transducers for comparison</h2>
            <h3>Comparison of lexicon size</h3>
				<div>hyw: &gt;999 999 stems</div>
        </section>
 -->

<section>
            <h2>Implementation</h2>
				<ul class="nobullet">
					<li>Two-level approach using <tt>lexd</tt>+HFST:</li>
					<ul class="shrink">
						<li>“diacritics” in lexical form, like <code><arch>Plն</arch></code></li>
						<li>used in morpholophonology to select phonologically-conditioned allomorphs<br/>(e.g., of the plural and definite)</li>
					</ul>
				</ul>
				<!-- The lexical form uses diacritics to later pick phonologically-conditioned allomorphs of the plural and definite -->
				<div class="box" style="width: 100%">
					<h4>orthographic form</h4>
					<div style="margin-bottom: 0.5em !important;"><span class="code">ձեռքերէն</span></div><span class="ipa small">/ʦeɾkʰ-eɾ-e-n/<br/>‘<em>from the hands</em>’</span>
				</div>
				<p>↕ (<code class="highlight">twol</code>: phonological mappings)</p>
				<div class="box" style="width: 100%">
					<h4>morphological/lexical form</h4>
					<span class="code">ձեռք&gt;<arch>Ninfl</arch>&gt;<arch>Plն</arch>եր&gt;է&gt;<arch>defն</arch></span>
				</div>

				<p>↕ (<code>lexc</code>: lexicon + morphotactics)</p>
				<div class="box" style="width: 100%">
					<h4>analysis: <span class="highlight">lemma, POS, grammatical tags</span></h4>
					<span class="code">ձեռք<tag>n</tag><tag>pl</tag><tag>abl</tag><tag>def</tag></span>
				</div>
			
			</section>

			   <section>
            <h2>Challenges: Lexicon compilations</h3>
				<div class="box">
					<h3>Problem:</h3>
					<ul>
						<!-- li> For a compound with monosyllabic second stem, need to manually check if the semantics use plural  -եր /-eɾ/ or  -ներ /-neɾ/ </li -->
						<!-- li>There are many Armenian dictionaries online, but essentially none of them provide data formats (text files) that allow for intensive use </li -->
						<li>Many online Armenian dictionaries, but data formats don't allow for intensive use</li>
						<li>Ideal: text files or accessible databases</li>
					 
					</ul>
				</div>
				<div class="box">
					<h3>Solution: </h3>
					<ul>
						<li>Scraped XML files from the Kouyoumdjian dictionary from 
<a href="nayiri.com/">Nayiri</a> </li> 
						<li>Extracted a 70K lexicon that we used to populate our analyser</li>
						<li>Unfortunately Nayiri doesn't publicly offer their XML anymore :( </li> 
						<li>But you can use our adaptation of it from our <a href="https://github.com/apertium/apertium-hyw/tree/master/lexd_components">GitHub</a> :) </li> 
					</ul>
				</div>
				<aside class="notes">
				<p class="shrink">What to read, in presenter notes</p>
				</aside>
			</section>

        <section>
            <h2>Challenges: compound plurals </h2>
				<div class="box">
					<h3>Problem:</h3>
					<ul>
						<!-- li> For a compound with monosyllabic second stem, need to manually check if the semantics use plural  -եր /-eɾ/ or  -ներ /-neɾ/ </li -->
						<li>Compound with monosyllabic second stem:<br/>Is plural -եր /-eɾ/ or -ներ /-neɾ/?</li>
						<ul>
						<li> 						
						<span class="code">դրամատուներ ↔ դրամատուն<tag>n</tag><tag>pl</tag></span>
						
					<div class="ipa">[tʰəɾɑmɑdun-eɾ]  ↔ [tʰəɾɑmɑdun] ‘<em>money-houses</em>’</div> 


						</li> 
					<li> 
												<span class="code">մեծատուններ ↔ մեծատուն<tag>n</tag><tag>pl</tag></span>

											<div class="ipa"> [med͡zɑdun-neɾ]  ↔ [med͡zɑdun] ‘<em>big-housed</em>’</div> 

						</li> </ul>
						<li>Need to manually code based on semantics</li>
					</ul>
				</div>
				<div class="box">
					<h3>Solution: </h3>
					<ul>
						<li>Double-checked against EANC to get the right semantics</li> 
					</ul>
				</div>
				<aside class="notes">
				<p class="shrink">What to read, in presenter notes</p>
				</aside>
			</section>

  <section>
            <h2>Challenges: infix punctuation  </h2>
				<div class="box">
					<h3>Problem:</h3>
					<ul> <li> Punctuation markers are added after vowel of stressed syllable of word<br />
					
					
					<table class="nolines">
  <tr>
    <th>Final stress</th>
    <th>Pre-schwa stress</th>
    <th>Irregular stress</th>
  </tr>
  <tr>
    <td>կապի՞կ</td>
    <td>կապի՞կը</td>
    <td>ո՞րքան</td>
  </tr>
  <tr>
    <td class="ipa">[ɡɑˈbi<sup>?</sup>ɡ]</td>
    <td class="ipa">[ɡɑˈbi<sup>?</sup>ɡə]</td>
    <td class="ipa">[ˈvo<sup>?</sup>ɾkʰɑn]</td>
  </tr>
  <tr class="addline-bottom">
    <td>‘<em>monkey?</em>’</td>
    <td>‘<em>the monkey?</em>’</td>
    <td>‘<em>how many?</em>’</td>
  </tr>
</table> </li> </ul>
				</div>
				<div class="box">
					<h3>Solution: </h3>
					<ul>
						<li>Have rules that add infix at end of word, and then “metathesise” (actually: delete and copy)</li>
						<li>Add control characters at location of irregular stress</li>
					</ul>
				</div>
				<aside class="notes">
				<p class="shrink">What to read, in presenter notes</p>
				</aside>
			</section>
			
			<section>
				<h2>Evaluation: Coverage</h2>
				<div class="float-left">
				<div class="box">
					<ul>
						<li><span class="highlight">Naive coverage</span>: <em>the number of forms in a corpus that receive an analysis, regardless of whether or not the analysis is correct (e.g., in context)</em></li>
					</ul>
				</div>
				</div>
				<div class="float-right" style="margin-top: 1em">
            <table>
                <tr>
                    <th>Corpus</th>
                    <th>Tokens</th>
                    <th>Coverage</th>
                </tr>
                <tr>
                    <td>Newspapers</td>
                    <td class="raln">~16M</td>
                    <td class="raln">87.68%</td>
                </tr>
                <tr>
                    <td>Wikipedia</td>
                    <td class="raln">~2.59M</td>
                    <td class="raln">88.79%</td>
                </tr>
					 <tr>
					 	<td>New&nbsp;Testament</td>
						<td class="raln">~190K</td>
						<td class="raln">90.87%</td>
					</tr>
            </table>
				<p class="shrink">Adequate coverage: robust morphology, but many missing stems</p>
				</div>
        </section>

        <section>
            <h2>Evaluation: Precision & Recall</h2>
				<div>
				<ul>
					<li>created gold standard:</li>
	            <ul>
	                <li>1225 valid words of Western Armenian</li>
						 <li>randomly selected from Newspaper corpus</li>
						 <li>manually annotated output of transducer</li>
	            </ul>
				</ul>
				</div>
				<div class="float-left">
				<div class="box">
					<ul class="nobullet">
						<li style="margin-bottom: 1em;"><span class="highlight">precision</span>: <em>how many of the transducer-provided analyses for the tokens were correct<!-- percentage of analyses returned by the transducer that is correct --></em></li>
						<li><span class="highlight">recall</span>: <em>how many of the correct analyses were retrieved from the transducer<!-- percentage of the correct analyses that is returned --></em></li>
					</ul>
				</div>
				</div>
				<div class="float-right">
				<div>results:</div>
            <table>
                <tr>
                    <th>Tokens</th>
                    <th>Precision</th>
                    <th>Recall</th>
                </tr>
                <tr>
                    <td>1225</td>
                    <td class="raln">90.58%</td>
                    <td class="raln">74.82%</td>
                </tr>
            </table>
				<p class="shrink">i.e., most forms returned by the transducer were deemed correct, but many correct analyses were not returned by the transducer (mostly due to low coverage)</p>
				</div>
        </section>

			<section>
				<h2>Future work</h2>
				<ul>
					<li>Expand <span class="highlight">lexicon</span> (add more lemmas)</li>
					<li>Morphological/syntactic <span class="highlight">disambiguation</span></li>
					<li>More language technology applications of transducer? (spell checkers, MT, etc.?)</li>
					<li><em>Slowly</em> working on incorporating Eastern morphology so that the analyzer is <span class="highlight">bi-dialectal</span></li> 
					<ul class="nobullet">
						<li class="arrow">Simple MT between different varieties of Armenian</li>
					</ul>
					<li> Incorporate an orthography-pronunciation convertor</li> 
					<li> Honestly, it's open-source, so you can do WHATEVER you want with it! </li> 
				</ul>
			</section>

				<section>
					<h2>Conclusion</h2>
					<ul>
						<li>Expansion of language technology for Western Armenian</li>
						<li>Robust transducer</li>
						<ul>
							<li>high coverage</li>
							<li>high precision</li>
							<li>moderate recall</li>
						</ul>
						<li>Lots of room for improvement!</li>
						<li>Ready for use in</li>
						<ul>
							<li>language technology applications</li>
							<li>downstream tasks</li>
						</ul>
					</ul>
					<div class="box fragment">
						<p class="bbb"><span class="cyrillic">Շնորհակա՛լ ենք</span>! / Thank you!</p>
						<ul>
							<li>Fork at <a href="https://github.com/apertium/apertium-hyx">github.com/apertium/apertium-hyw</a></li>
							<li>Try out at <a href="https://beta.apertium.org/#analysis?aLang=hyx_hyw">beta.apertium.org</a></li>
						</ul>
					</div>
				</section>

			</div>
		</div>

		<!-- script src="./reveal.js/lib/js/head.min.js"></script -->
		<script src="./reveal.js/dist/reveal.js"></script>
		<script src="./reveal.js/plugin/notes/notes.js"></script>
		<script src="./reveal.js/plugin/markdown/markdown.js"></script>
		<script src="./reveal.js/plugin/highlight/highlight.js"></script>
		<script src="./reveal.js/plugin/zoom/zoom.js"></script>

		<script>

			// Full list of configuration options available here:
			// https://github.com/hakimel/reveal.js#configuration
			Reveal.initialize({
				controls: true,
				progress: true,
				center: true,
				history: true,
				hash: true,

				// The "normal" size of the presentation, aspect ratio will be preserved
				// when the presentation is scaled to fit different resolutions. Can be
				// specified using percentage units.
				width: 1280, //960,
				//height: 720, //700,
				height: 1024,
				//width: 960,
				//height: 700,

				// Factor of the display size that should remain empty around the content
				margin: 0.1,

				// Bounds for smallest/largest possible scale to apply to content
				minScale: 0.2,
				maxScale: 1.0,

				//theme: Reveal.getQueryHash().theme, // available themes are in /css/theme

				transition: 'slide', // none/fade/slide/convex/concave/zoom
				// transition: Reveal.getQueryHash().transition || 'fade', // default/cube/page/concave/zoom/linear/fade/none

				// Shows the slide number using default formatting
				//Reveal.configure({ slideNumber: true });
				//slideNumber: true,

				//if true (default), fragments are generated on separate pages of pdfs
				//pdfSeparateFragments: false,

				// Slide number formatting can be configured using these variables:
				//  "h.v":  horizontal . vertical slide number (default)
				//  "h/v":  horizontal / vertical slide number
				//    "c":  flattened slide number
				//  "c/t":  flattened slide number / total slides
				//Reveal.configure({ slideNumber: 'c' });
				slideNumber: 'c',

				// Parallax scrolling
				// parallaxBackgroundImage: 'https://s3.amazonaws.com/hakim-static/reveal-js/reveal-parallax-1.jpg',
				// parallaxBackgroundSize: '2100px 900px',

				// Optional reveal.js plugins
				// More info https://github.com/hakimel/reveal.js#dependencies
				/***
				dependencies: [
					{ src: './reveal.js/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
					{ src: './reveal.js/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
					{ src: './reveal.js/plugin/highlight/highlight.js' },
					{ src: './reveal.js/plugin/search/search.js', async: true },
					{ src: './reveal.js/plugin/zoom-js/zoom.js', async: true },
					{ src: './reveal.js/plugin/notes/notes.js', async: true }
				]
				***/
				// Learn about plugins: https://revealjs.com/plugins/
				plugins: [ RevealMarkdown, RevealZoom, RevealHighlight, RevealNotes ]
			});

		</script>

	</body>
</html>