-
Notifications
You must be signed in to change notification settings - Fork 0
/
BlogNoSql.html
671 lines (663 loc) · 41.5 KB
/
BlogNoSql.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
<!DOCTYPE html>
<html>
<!--
BlogNoSql.html
-->
<head>
<title>Blog: NoSql</title>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<!-- <link rel="icon" type="image/x-icon" href="./images/favicon.ico" /> -->
<link rel="stylesheet" href="css/StylesPhoto.css" />
<link rel="stylesheet" href="css/StylesSizerComp.css" />
<!-- PageFrame infrastructure -->
<link rel="stylesheet" href="css/StylesPageFrameDefaults.css" />
<link rel="stylesheet" href="css/StylesPageFrameStructure.css" />
<link rel="stylesheet" href="css/StylesPageFrameMenus.css" />
<link rel="stylesheet" href="css/StylesPageFrameThemePython.css" />
<link rel="stylesheet" href="css/StylesWebComponents.css" />
<script src="js/ScriptsWebComponents.js"></script>
<!--<script src="js/ScriptsPageFrameDefaults.js"></script>-->
<script src="js/ScriptsPageFramePosts.js"></script>
<script src="js/ScriptsPageFramePagesPosts.js"></script>
<script src="js/ScriptsPageFrameKeyboard.js"></script>
<script src="js/ScriptsSizerComp.js"></script>
<!-- No need for Pages script for pages with no next or prev pages -->
<!--<script src="js/ScriptsPageFramePages.js"></script>-->
<!-- <script src="js/ScriptsTemplate.js"></script>
<link rel="stylesheet" href="css/StylesTemplate.css" /> -->
<style>
h3 {
margin-top: 1.5em;
}
#subtitle {
margin-top: 0.4em;
margin-bottom: 0.3em;
}
#github header summary {
border: 1px solid var(--light);
}
#github summary {
padding-right: 2em;
}
/* #github .menuHead {
margin:0em -0.25em 0.0em -0.25em;
padding:0.25em 0.5em;
} */
</style>
<script>
/**
 * Page load handler, wired up through the body element's onload attribute.
 * Delegates to initialize(), which is not defined in this file
 * (presumably supplied by one of the included PageFrame scripts - confirm).
 */
function load() {
  // loadif() stays disabled at page load; the header's onclick invokes it instead.
  initialize();
}
</script>
<style>
#github note {
display: block;
width:max-content;
border:1px solid red;
padding:0.5em 1.0em;
margin:0.5em 0em;
}
#github .bargraph {
border: 1px solid var(--dark);
/* background-color: #bbb; */
padding: 0.1em 0.5em;
font-size:0.9em;
}
#github table {
border:2px solid var(--dark);
}
#github table td {
padding:0.25em 1.0em;
border:none;
}
body {
user-select:none;
}
</style>
<script>
/**
 * Stops a click from reaching any other click handlers, including those
 * registered on ancestor elements.
 *
 * @param {Event} [evt] - The event being handled. Optional: when omitted,
 *   the function falls back to the legacy global window.event so existing
 *   inline handlers written as onclick="clickstat()" keep working.
 */
function clickstat(evt) {
  // window.event is deprecated and only set while an event is being
  // dispatched, so prefer the explicitly passed event when there is one.
  var current = evt || window.event;
  if (current) {
    // prevent parent click event handling
    current.stopImmediatePropagation();
  }
}
</script>
</head>
<body id="github" onload="load()" style="position:relative;">
<a id="Next" href="BlogParser.html">Next</a>
<a id="Prev" href="BlogActiveObjects.html">Prev</a>
<page-frame>
<frame-header>
<nav id="navbar"></nav>
</frame-header>
<main id="main">
<div id="about" onclick="this.style.display = 'none'">about</div>
<div id="page">Blog: NoSql</div>
<div id="modified">11/27/2024</div>
<div id="hlp"></div>
<a id="top"></a>
<content style="height:100vh; position:relative;">
<header style="cursor:pointer;" onclick="loadif()">
<!-- <a target="_blank" class="repoLink" href="https://github.com/JimFawcett">github Repositories</a> -->
<hgroup id="pagetitle" style="border: 2px solid var(--dark);">
<h1 id="title">Blog: NoSql</h1>
<h3 class="indent" id="subtitle">
data models, uses, structure, activities, queries
</h3>
</hgroup>
<!-- <img style="width:100%; margin:-0.1em 0em; border:2px solid var(--dark); padding:0.5em; background-color:var(--light);" src="Pictures/officestrip3a.svg" /> -->
<div class="darkItem" onclick="loadif()" style="cursor:pointer; position:relative; padding:0.0em 0em 0.25em 0em; margin-top:-0.50em; border:2px solid var(--dark);">
<a class="repoLinks" target="_blank" href="https://github.com/JimFawcett" style="color:var(--atten); margin-left:1.5em;">About</a>
<div style="font-size:0.9em; position:absolute; top:0.1em; right:1.5em;">click to toggle Site Explorer</div>
<div style="height:0.5em;"></div>
</div>
</header>
<h2>Initial Thoughts:</h2>
<t-b>
There is currently a lot of technical interest in <a target="_blank" href="https://en.wikipedia.org/wiki/Big_data">"Big Data"</a>.
Extreme examples are: data collection and analyses from the <a target="_blank" href="https://en.wikipedia.org/wiki/Large_Hadron_Collider">Large Hadron Collider</a>,
the <a target="_blank" href="http://www.sdss.org/">Sloan Sky Survey</a>, analyses of Biological <a target="_blank" href="http://www.genome.jp/kegg/kegg1a.html">Genomes</a>,
collecting data for <a target="_blank" href="https://en.wikipedia.org/wiki/General_Circulation_Model">global climate models</a>, and
analyzing client interactions in <a target="_blank" href="https://en.wikipedia.org/wiki/Social_network_analysis">social networks</a>.
</t-b>
<t-b>
Conventional SQL databases may not be well suited for these kinds of applications. While they have worked very well for many
business applications and record keeping, they get overwhelmed by massive streams of data.
Developers are turning to <a target="_blank" href="https://en.wikipedia.org/wiki/NoSQL">"noSQL" databases</a>
like <a target="_blank" href="https://www.mongodb.org/">MongoDB</a>, <a target="_blank" href="http://couchdb.apache.org/">CouchDB</a>,
and <a target="_blank" href="http://redis.io/">Redis</a> to handle massive data collection and analyses.
</t-b>
<h3 id="sdm">SQL Data Model:</h3>
<t-b>
Traditional SQL databases provide a very well understood data management model that supports the <a target="_blank" href="https://en.wikipedia.org/wiki/ACID">ACID properties</a>,
e.g., each transaction is <strong>A</strong>tomic, leaves managed data in a <strong>C</strong>onsistent state, appears to operate in <strong>I</strong>solation from other
transactions that may operate concurrently, and at the end of the transaction the database state is <strong>D</strong>urable, i.e., is persisted to a permanent
store.
</t-b>
<t-b>
SQL data is normalized into tables with relationships. This matches very well with data models where many records may be associated with the same data.
If we build a books database, for example, many books may be associated with the same publisher information. We link the book information with a foreign key
relationship to publisher information in another table to avoid duplicating the same publisher data in every book record. Many to many relationships are
modeled by linking tables often containing two foreign keys. For the books database a book may have several authors and an author may have published more
than one book. So the link table holds records each of which capture the association of a book with an author. If a book has two authors there are two
records with that book key, one for each author.
</t-b>
<t-b>
Each SQL Table has a fixed schema that captures the type of the records in the table. A record in the books table
might contain the book's name and date of publication.
SQL database designs emphasize data integrity and structuring models in a fixed normalized tabular form.
Queries into the database usually join data from
several tables to build a complete description of the results to be returned.
</t-b>
<h3 id="nsdm">noSQL Data Models:</h3>
<t-b>
The data models used by noSQL databases are usually based on key/value pairs, document stores, or networks. noSQL processing
favors modeling flexibility, the ability to easily scale out across multiple machines, and performance with very large datasets.
For that flexibility they give up real-time data consistency, accepting application enforced eventual consistency. They give up
a formal query mechanism (hence the name). And, they may give up Durability guarantees by only occasionally writing to persistent
storage in order to provide high throughput with large volumes of data.
</t-b>
<t-b>
The choice to use <a target="_blank" href="http://www.paperplanes.de/2010/7/5/relational_data_document_databases_schema_design.html">SQL or noSQL</a>
data management is driven by the nature of its uses. Below we discuss <a target="_blank" href="../CSE681/lectures/Project5-F2015.htm">Project #5</a>,
an application that builds a data management service for a large collaboration system composed of federated servers. That seems
ideally suited for noSQL data management.
</t-b>
<h3 id="goals">Goals of a noSQL Implementation:</h3>
<t-b>
The noSQL model has goals that often prove to be difficult to implement with SQL databases. A noSQL database is designed to support one or more
of the following:
<ul>
<li>Very large collections of data</li>
<li>High throughput with data from streams</li>
<li>Tree or graph models for its data</li>
<li>Heterogeneous collections of data</li>
</ul>
When repeated data isn't a concern, we may avoid the overhead associated with following query references through potentially
many tables and persisting every transaction to a durable store by using a network or key/value reference mechanism in
conjunction with mostly in-memory storage using only occasional writes to the file system. However, when dealing with very large
data models these writes will likely be <a target="_blank" href="https://msdn.microsoft.com/en-us/library/azure/dn764982.aspx">sharded</a>
into many files for durable storage. Probably a few shards, the most recently used, will be held in memory.
</t-b>
<t-b>
A noSQL model may use a hashtable to store key/value pairs incurring essentially constant time lookup and retrieval of its data, e.g.,
time independent of the size of the data. However, when the size of the managed data requires sharding, the
constant time lookup and retrieval may be compromised by processing necessary to locate shards that contain the data we
need to retrieve. We need to think about things like managing multiple shards in memory using a Least Recently Used
mapping strategy, much like a virtual memory system. We will likely think about using in-memory indexes to keep track of which
shards hold specific data items or categories of items. For some applications it may be appropriate to shard data into time-related
batches, e.g., data collected in a day or a week.
</t-b>
<t-b>
With SQL data management all data is managed the same way. The only flexibility is how we partition the data into tables and
possibly shard data across multiple machines. Changing the schemas and sharding strategy can be quite difficult to implement.
Using noSQL databases we have a lot more flexibility in configuring data and it is easier to change schemas.
</t-b>
<t-b>
The good news is that configuring data, managing schemas, determining when and how to persist to durable storage, and maintaining
consistency is, with noSQL, up to the application. The bad news is that it is up to the application.
</t-b>
<h2 id="implement">Implementing a noSQL Database:</h2>
<t-b>
In <a target="_blank" href="CSE681.htm">CSE681 - Software Modeling & Analysis</a>, Fall 2015, we are exploring the development of a noSQL
database in a series of five projects:
<ul>
<li>
<a target="_blank" href="../CSE681/lectures/Project1-F2015.htm">Project #1</a><br />
Develop the concept for a basic noSQL application. We capture the concept with an "Operational Concept Document" (OCD).
</li>
<li>
<a target="_blank" href="../CSE681/lectures/Project2-F2015.htm">Project #2</a><br />
Implement most of the concept and perform thorough functional tests.
</li>
<li>
<a target="_blank" href="../CSE681/lectures/Project3-F2015.htm">Project #3</a><br />
Develop the concept for a remote noSQL application, based on Project #2, using
a message-passing communication service.
</li>
<li>
<a target="_blank" href="../CSE681/lectures/Project4-F2015.htm">Project #4</a><br />
Implement the remote noSQL database server and do performance testing.
</li>
<li>
<a target="_blank" href="../CSE681/lectures/Project5-F2015.htm">Project #5</a><br />
Create and document a data management service architecture using the ideas developed in the first four projects.
This service will provide the communication and state management infrastructure for a large Software Development
Collaboration System composed of a federation of cooperating servers and client controllers.
</li>
</ul>
Our goals are to understand why noSQL databases are interesting and useful, how they could be built, and
to think about the consequences of this approach. The concepts, developed in Projects #1 and #3, are expressed
in Operational Concept Documents that focus on users and uses, top-level application structure, and critical issues.
</t-b>
<t-b>
Documenting critical issues helps us think critically about our ideas and planned implementation before committing
to code. We may find that biasing our design in one direction or another may support the spinning off of new applications
and services from a solid base. We might also find that there are significant impediments on the path we are embarking upon
that force a rethinking of the application and its goals.
</t-b>
<h3 id="uses">Concept -> Uses:</h3>
<t-b>
In the projects for this course, we will be concerned with storing very large data sets, accepting data from streams quickly,
storing and accessing networks of data, and managing collections of heterogeneous data.
</t-b>
<t-b>
In the final project this Fall we will investigate the feasibility of building a data management service for a large collaboration
system. That involves: managing a large repository's data, recording continuous integration and test activities,
managing notifications to a large collection of clients, and building and maintaining templates for test configurations,
collaboration sessions, work package descriptions, etc.
</t-b>
<t-b>
For the first project, however, uses focus on understanding requirements needed to implement a noSQL database, exploring alternative
structures, and demonstrating the implications of our design choices. The users are the developer, Teaching Assistants, and the Instructor.
Essentially each student developer is responsible for demonstrating that each of the requirements in the
<a target="_blank" href="Project2-F2015.htm">Project 2</a> statement has been met.
</t-b>
<t-b>
The design impact of this use is that the implementation must carefully demonstrate requirements in a step-by-step
fashion. When a requirement asks for the ability to change some aspect of the database state it is the design's responsibility
to show the state before, display the nature of the change, and display the database state after the change. This should be done
while keeping the display as economical as practical, thereby limiting what an observer must understand to verify the action.
</t-b>
<h3 id="structure">Concept -> Structure:</h3>
<t-b>
Perhaps the easiest way to begin creating a structure for an application we're developing is to think about the tasks it must
execute. The project statement for <a target="_blank" href="../cse681/lectures/Project2-F2015.htm">Project #2</a> requires the noSQL prototype to provide the
capability to:
<ul>
<li>Create items described by metadata and holding an instance of some generic type.</li>
<li>Create and Manage a Key/Value database with capability to store and delete Key/Value<sup><a target="_blank" href="#footnote">1</a></sup> pairs.</li>
<li>Edit Values</li>
<li>Persist database contents to an XML file<sup><a target="_blank" href="#footnote">2</a></sup>.</li>
<li>Augment database contents from an XML file with the same format as persisted, above.</li>
<li>Support a variety of queries, both simple and compound.</li>
<li>Support demonstration of all functional requirements through a series of discrete tests with display to the console.</li>
</ul>
</t-b>
<t-b>
Each database Value has structured meta-data and an instance of the generic Data type. We will choose to create a C# class to represent
Values that might look something like this:
</t-b>
<t-b>
<pre>
public class Value&lt;Key,Data&gt;
{
// public methods providing
// access to private data
private string name; // Note: you may choose to capture
private DateTime timeStamp; // these Value states as properties
private string description; // rather than private data items.
private List&lt;Key&gt; children;
private Data payload;
}
</pre>
Here, the payload is what we really want to save in the database. Metadata is just information about the payload
that helps us find what we want with a query.
</t-b>
<t-b>
We will also define a C# class that represents the database engine:
<pre>
public class noSQLdb&lt;Key,Value&gt;
{
// public methods providing database API
private Dictionary&lt;Key,Value&gt; // The <a target="_blank" href="https://msdn.microsoft.com/en-us/library/xfhwa508(v=vs.110).aspx">dictionary</a> should not be a public property.
}
</pre>
</t-b>
<t-b>
Each task in the list at the top of this section is a candidate to become a package. Some we may decide to merge later.
There may also be times to take an existing package and divide into smaller packages. Usually that happens when the
original was becoming too complicated to test easily. Finally there may be a very few packages that we didn't have
the foresight to define in the concept, but discover a need for during implementation.
</t-b>
<div style="float:left; margin:20px 30px 20px 20px; border:2px solid gray; padding:5px; text-align:center; box-shadow:5px 5px 2px #888;">
<img src="Pictures/PackageDiagramPr2F15.jpg" height="500" alt="Top-level package diagram for the Project #2 noSQL database" />
</div>
<div style="min-width:300px;">
<t-b>
We start with a TestExec package at the top that is responsible for the project's main use - demonstrating that requirements
have all been met.
</t-b>
<t-b>
TestExec creates instances of Key/Value pairs using a simple factory that may generate a unique key and
construct a Value with supplied parameters.
</t-b>
<t-b>
It uses those pairs to populate its noSQLdb instance through an API provided by
the DBEngine package.
</t-b>
<t-b>
The nature of query processing and sharding are the most interesting parts of this project and will be
left to students to work out in their individual ways.
</t-b>
<t-b>
The remaining parts are self-explanatory after reading the
<a target="_blank" href="../CSE681/lectures/Project2-F2015.htm">Project Statement</a>.
</t-b>
<t-b>
When an application is large or becomes complex we often provide a top-level package diagram, like this one,
and later provide more package diagrams for individual parts with significant internal structure.
</t-b>
<t-b>
We almost always provide activity diagrams to help OCD readers understand the intent of the concept.
The OCD for this project would greatly benefit from activity diagrams for handling queries and for
sharding. These are left for students to provide.
</t-b>
</div>
<div style="clear:both;"></div>
<h3 id="critissues">Concept -> Critical Issues:</h3>
<t-b>
<ol>
<li>
<strong>Issue:</strong> - Demonstrating Requirements<br />
Students only get credit for requirements they clearly demonstrate. No inputs other than a supplied
XML file to load the initial database are required<sup><a target="_blank" href="#footnote">3</a></sup>. The only output required is a console display.<br />
<strong>Solution:</strong><br />
This requires careful orchestration of a series of tests invoked by the test executive and supported by processing
in the Display package.
<br />
<strong>Impact on Design:</strong><br />
It will be effective
to provide a method for each test that announces the Requirement number and displays db state before and after
each change.
</li>
<li>
<strong>Issue:</strong> - Designing Queries<br />
Statement and solution(s) are left to the students.
</li>
<li>
<strong>Issue:</strong> - Sharding<br />
Statement and solution(s) are left to the students.
</li>
<li>
<strong>More Issues:</strong> - Left to Students.
</li>
</ol>
</t-b>
<h3>Later Projects:</h3>
<t-b>
After completing Project #2 we work on a concept, in <a target="_blank" href="../CSE681/Lectures/Project3-F2015.htm">Project #3</a>, and implement, in
<a target="_blank" href="../CSE681/Lectures/Project4-F2015.htm">Project #4</a> remote access to the noSQL prototype via message-passing communication.
</t-b>
<t-b>
Finally we develop an architecture, in <a target="_blank" href="../CSE681/Lectures/Project5-F2015.htm">Project #5</a>, for a data management service
in a large Software Development Collaboration Environment using the NoSQL model we created in the earlier projects.
</t-b>
<t-b>
You will find that several noSQL databases are required for Project #5 and that the key types and value types will not all be the same. I would
expect that sharding strategies may vary from database to database. For that reason, it would be interesting to support
pluggable sharding strategies in our noSQL design. You should probably address that as a critical issue in your OCD for
Project #1<sup><a target="_blank" href="#footnote">4</a></sup>.
</t-b>
<h2>Concept Revisited:</h2>
<t-b>
All the discussion that follows was added after students turned in their noSQL Operational Concept Documents. This discussion is concerned
with things I wanted students to think about without being given too much guidance, but now want to clarify before they begin their designs for
the noSQL Database. We will focus on meeting Requirements, Queries, Sharding, and say a couple of things about the ItemFactory. To understand the details
discussed here you will find it helpful to look again at the <a target="_blank" href="../CSE681/Code/Project2HelpF15">Project #2 starter code</a> discussed
in class.
</t-b>
<h3 id="toplevelact">Top Level Activities:</h3>
<t-b>
In Project #5 our architecture describes data services for a federation of servers designed to support collaboration activities in a software development environment.
In Project #2, we're providing the noSQL database to support those services.
</t-b>
<div style="float:right; margin:20px 30px 20px 20px; border:2px solid grey; padding:15px; text-align:center; box-shadow:5px 5px 2px #888;">
<img src="Pictures/RequirementsActivityDiagram.png" width="480" alt="Top level activity diagram" />
<div style="text-align:center; padding-top:10px;">Top Level Activity Diagram</div>
</div>
But for Project #2 all activities focus on demonstrating requirements. To do this we:
<div>
<ul>
<li>
Demonstrate creation of each of the required parts, DBEngine&lt;Key, Value&gt;, DBElement&lt;Key, Data&gt;
</li>
<li>
Demonstrate adding, removing, and editing DBElements&lt;Key, Value&gt; while contained in the database.
</li>
<li>
Each time we make a required change in the database's state we are obligated to show the state before
the change, the changed DBElement, and the state of the database after the change.
</li>
<li>
Write the database contents to an XML file, clear the database contents, add a new element or two,
then read the XML and augment the existing database contents from the XML file's contents.
</li>
<li>
Clear the database contents and load the database from an XML file describing your project's structure.
</li>
</ul>
In the help session on Friday Morning, 9/18/2015 we started the construction of a TestExec that implements
these activities. You will find that code here: <a target="_blank" href="../CSE681/code/Project2HelpF15">Project2HelpF15</a>
in the Project2Starter folder.
</div>
<h3 id="queries" style="clear:both;">Queries:</h3>
<t-b>
First, what is a query for this noSQL database? Let's define that in parts:
<ul>
<li>
A <strong>QueryPredicate</strong> is a function that accepts a db key and returns true or false depending upon the processing of the
predicate function. For this noSQL db, the processing will look for specific conditions in the element bound to the supplied key,
e.g., name, description, time-date stamp, children, or payload. We will use C# lambdas to implement QueryPredicates. See
<a target="_blank" href="../Coretechnologies/CSharp/code/LambdaDemo">LambdaDemo</a> for code that is close to what we need here.
</li>
<li>
A <strong>simple query</strong> then, consists of applying the QueryPredicate to each of the keys in the database and collecting all of the
keys for which the predicate is true. That's what DBFactory&lt;Key, Value&gt; does.
</li>
<li>
A compound query is a chain of queries, each query using the keyset returned by the previous query<sup><a target="_blank" href="#footnote">5</a></sup>. We'll need some
way to manage those returned keysets - QueryEngine manages that process, using DBFactory&lt;Key, Value&gt;s to wrap return keysets.
At each step QueryEngine constructs a DBFactory&lt;Key, Value&gt; instance, applies its next QueryPredicate to that instance, to
get the next keyset.
</li>
</ul>
</t-b>
<div style="float:left; margin:20px 30px 20px 20px; border:2px solid grey; padding:15px; text-align:center; box-shadow:5px 5px 2px #888;">
<img src="Pictures/noSQLquery.png" width="480" alt="Query processing class diagram" />
<div style="text-align:center; padding-top:10px;">Query Processing Class Diagram</div>
</div>
<t-b>
So, QueryEngine wraps each query return in an object that holds the resulting keyset and has the same reading interface (keys(), getValue(key, out val))
as the DBEngine but doesn't have any writing methods. That's DBFactory&lt;Key, Value&gt;.
</t-b>
<t-b>
Suppose that we define a C# interface, IQuery, that declares
those "reading" methods and have both DBFactory and DBEngine implement that interface.
Then to make a query the query engine applies a QueryPredicate to DBEngine&lt;Key, Value&gt;, creates a DBFactory&lt;Key, Value&gt;
passing to its constructor the keyset returned by the query.
</t-b>
<t-b>
If the query is compound this process is repeated using the DBFactory
at each successive step. Thus the keyset gets refined at each step of the compound query. Essentially the DBFactory is just a
container of keys that link back to the DBEngine through a reference held by the DBFactory.
</t-b>
<t-b>
With that setup we can define the simpleQuery method to use the IQuery interface so it can be applied at each step of the query, e.g.,
to either DBEngine&lt;Key, Value&gt; or to DBFactory&lt;Key, Value&gt;.
</t-b>
<div style="float:left; margin:20px 30px 20px 20px; border:2px solid grey; padding:15px; text-align:center; box-shadow:5px 5px 2px #888;">
<img src="Pictures/Project2-Activity.png" width="480" alt="Query processing activity diagram" />
<div style="text-align:center; padding-top:10px;">Query Processing Activity Diagram</div>
</div>
<t-b>
QueryEngine is configured with a set of QueryPredicates. It uses the first on DBEngine&lt;Key, Value&gt; to get a DBFactory&lt;Key, Value&gt;
typed as an IQuery&lt;Key, Value&gt; with the first keyset, and uses each
successive query on the returned DBFactory&lt;Key, Value&gt; to refine the keyset returned by the previous query. You will find
the PredicateLambda project in <a target="_blank" href="../CoreTechnologies/CSharp/Code/LambdaDemo">LambdaDemo</a> to contain code that is close to
what you need for Project #2.
</t-b>
<t-b>
You can think of the QueryPredicates as equivalent to stored procedures in a conventional database.
</t-b>
<h3>Sharding:</h3>
<t-b>
Sharding is the process of writing out part of the contents of the noSQL Database to files - we'll use XML processing to do that.
So how do we decide what part to write out for each shard? Here, the designer has to understand the data she's working with.
</t-b>
<t-b>
In <a target="_blank" href="../CSE681/Lectures/Project5-F2015.htm">Project #5</a> we're building a data management service for a federation of
servers that implement a Software Development Collaboration Facility. One of those servers is a Software Repository. We'll use
a noSQL database to manage files in terms of their dependencies, e.g., which packages depend on which others. So each element
of the database represents a package. The parent - child relationships supported by a noSQL database are just what we need to
represent dependencies between packages. Here, we'll probably use the package name as the key
for each element of the database.
</t-b>
<t-b>
So, we'll shard that database by subsystem. All the packages in one subsystem are described by one shard. Note that we don't save
the packages in the database. We just save their locations and dependencies. The file system does a beautiful job of holding onto
the physical files.
</t-b>
<t-b>
Similarly, another server is a test harness. It generates perhaps hundreds of test events per second, e.g., which test is executing,
did it pass or fail, ... For this we'll probably shard based on time, e.g., all the messages that occurred today.
</t-b>
<h4 id="sharding">Sharding Critical Issues:</h4>
There are two obvious critical issues for sharding:
<ul>
<li>
How to select the contents of individual shards. The only likely solution is to know your data (see the comments above).
</li>
<li>
How to efficiently query information distributed across shards as well as what's in memory. One solution for that is to use another
noSQL database to manage sharding. If each element of that database represents a shard, then we can describe the shard
with the descriptor, and supply more details in the payload. So a query into the shards uses the shard manager db to find
the appropriate, hopefully small number of, shards to load into memory and query.
</li>
</ul>
<h3>ItemFactory:</h3>
<t-b>
The starter code uses a DBElement&lt;Key, Data&gt; to ensure that all the code that uses the noSQL database have elements that
are structured in the same way. ItemFactory is intended to support the creation of database elements where there are many very
similar instances to create. Logging for the TestHarness in <a target="_blank" href="../CSE681/Lectures/Project5-F2015.htm">Project #5</a>
is an example. There, the items will have identical, or nearly identical descriptors, have auto-generated names, and probably
have text payloads. The ItemFactory could use XML templates to configure items for these special situations so that client code
won't have to provide that functionality. It should use DBElement&lt;Key, Data&gt; for all the heavy lifting.
</t-b>
<t-b>
The ItemEditor is responsible for supporting revisions to any part of a DBElement&lt;Key, Data&gt;, e.g., name, descriptor, time-date
stamp, child relationships, and payload. It too should use DBElement&lt;Key, Data&gt; to manage all the element parts, and just
provide the functionality for change. Most of this is quite straightforward, but there is one interesting issue: how to support
editing the payload when we don't know anything about its structure. For that we could make the ItemEditor class generic on the
payload Data type and ask the user to supply editing facilities for that. How to do that is an interesting question to think about.
</t-b>
<hr />
<a name="footnote"></a>
<ol class="footnote">
<li>
The C# language has two kinds of types: value types and reference types. Value types reside in static or stack memory, are
copyable, and when assigned are unique from the original source. Reference types reside in managed memory and are, in general,
not copyable nor assignable. The program's code may copy or assign a reference to an instance on the managed heap, but
both target and source of the reference copy or assignment are the same heap-based instance. Our use of the term Value
in this blog does not mean a C# value type. It simply means the database data referenced by the key. The kind of its type may be
either a C# value or C# reference type.
</li>
<li>
Project #1 encourages students to think about issues like sharding. We do not require students to implement sharding in
Project #2 but would be pleased to see and review any sharding processes they may attempt.
</li>
<li>
Please do not provide console menus. A GUI could be effective for Project #2 but I would much rather have you spending
your time working on the functional requirements.
</li>
<li>
This is a test to see if you've read the entire blog carefully before submitting your first project.
</li>
<li>
The compound query discussed above is composed of a collection of clauses (conditions) joined with "and"s, e.g.,
ConditionA and ConditionB and ... must be true to satisfy the query. Of course we may have queries joined by "or"s
or some mix. For this project we will only require "and"ing clauses as that is the most common.
</li>
</ol>
<h3><a target="_blank" href="../CSE681/Lectures/Project2-F2015.htm">Project #2</a> Take-aways:</h3>
<div class="notecallout">
<t-b>
A few of the ideas discussed here are prototyped in:
<ul>
<li>
<a target="_blank" href="../CSE681/Code/Project2HelpF15">Project #2 starter code</a><br />
Provides starter code for DBEngine&lt;Key, Value&gt;, DBElement&lt;Key, Data&gt;, and associated display code. After
the help session, Friday 9/18/2015, it also contains a TestExec that begins to demonstrate requirements.
</li>
<li>
<a target="_blank" href="../CoreTechnologies/CSharp/Code/LambdaDemo">LambdaDemo</a><br />
Contains three projects that dig into delegate-lambda relationships and, of particular value,
a project named PredicateLambda, which provides starter code for the QueryEngine
and DBFactory&lt;Key, Value&gt;.
</li>
</ul>
Obviously there is still a lot of design and implementation left for you to enjoy before October 7th.
</t-b>
<t-b>
For those of you without a lot of software development experience, you should note that:
<ul>
<li>There is no requirement for sharding. You just have to write the db contents to and from an XML file.</li>
<li>
There is no requirement for compound queries. If you don't make compound queries then QueryEngine&lt;Key, Value&gt; just needs
one queryPredicate for a query and you don't need to build the DBFactory&lt;Key, Value&gt;; you just do its processing one time
in QueryEngine&lt;Key, Value&gt;.doQuery().
</li>
<li>
You won't be penalized for omitting features that you discussed in the concept but that were not required by the
Project #2 statement.
</li>
<li>
Your code must have a well-thought-out structure and use well-formed packages, but it doesn't need to
look just like the concepts discussed here.
</li>
</ul>
</t-b>
</div>
<div style="height:1.0em;"></div>
<t-b>
<img class="photo" src="Pictures/carnagieStrip.jpg" alt="Newhouse" style="width:calc(100vw - 9em);" />
</t-b>
<div style="height:1.0em;"></div>
<a id="bottom"></a>
</content>
<page-TOC id="pages" style="display:none;">
</page-TOC>
<page-sections id="sections" style="display:none;">
<menu-elem style="width:0.0em"> </menu-elem>
<menu-elem class="secElem"><a href="#bottom">bottom</a></menu-elem>
<menu-elem class="secElem"><a href="#sharding">sharding</a></menu-elem>
<menu-elem class="secElem"><a href="#queries">queries</a></menu-elem>
<menu-elem class="secElem"><a href="#toplevelact">top level act</a></menu-elem>
<menu-elem class="secElem"><a href="#critissues">critical issues</a></menu-elem>
<menu-elem class="secElem"><a href="#structure">structure</a></menu-elem>
<menu-elem class="secElem"><a href="#uses">uses</a></menu-elem>
<menu-elem class="secElem"><a href="#implement">implement</a></menu-elem>
<menu-elem class="secElem"><a href="#goals">goals</a></menu-elem>
<menu-elem class="secElem"><a href="#nsdm">NoSQL data model</a></menu-elem>
<menu-elem class="secElem"><a href="#sdm">SQL data model</a></menu-elem>
<menu-elem class="secElem"><a href="#top">top</a></menu-elem>
<div class='darkItem popupHeader' style="padding:0.25em 2.0em;" onclick="this.parentElement.style.display='none'">Sections</div>
</page-sections>
</main>
<frame-footer>
<menu-item style="width:2.0em;"> </menu-item>
<menu-elem id="nextLink2" onclick="bottomMenu.next()">Next</menu-elem>
<menu-elem id="prevLink2" onclick="bottomMenu.prev()">Prev</menu-elem>
<menu-elem id="pgbtn" onclick="bottomMenu.pages()">Pages</menu-elem>
<menu-elem onclick="bottomMenu.sections()">Sections</menu-elem>
<menu-elem onclick="bottomMenu.about()">About</menu-elem>
<menu-elem id="kysbtn" onclick="storyHlpMenu.keys()">Keys</menu-elem>
<menu-elem style="margin-right:1em">
<span id="loc" style="display:inline-block; font-weight:normal"></span>
</menu-elem>
</frame-footer>
</page-frame>
<script>
// Register the page's six figures by calling createSizer once per entry, in order.
// createSizer is defined outside this part of the file; each row below is passed to it
// unchanged as (image path, caption text, number, id string). The number is presumably
// an initial width in pixels; confirm against createSizer's definition.
// NOTE(review): the captions describe message-passing figures (MPC, Document Vault)
// rather than NoSQL content; confirm these figures belong on this page.
{
  // Block scope keeps the table from adding a new global binding.
  const figureArgs = [
    ["Pictures/Communicators.png", "Fig 1. MPC with Active Communicators", 400, "fig1"],
    ["Pictures/CommObjs.png", "Fig 2. Document Vault using Message Dispatching in Server", 500, "fig2"],
    ["Pictures/CommunicatorActivities.png", "Fig 3. Client and Server Activities", 400, "fig3"],
    ["Pictures/CommunicatorPackages.png", "Fig 4. Client and Server Packages", 400, "fig4"],
    ["Pictures/MsgPassFramework.png", "Fig 5. Document Vault Demo Logical Structure", 400, "fig5"],
    ["Pictures/CommunicatorOutput.png", "Fig 6. Document Vault Demo Output", 400, "fig6"]
  ];
  for (const args of figureArgs) {
    createSizer(...args);
  }
}
</script>
<script>
// Show the current page's file name, followed by a colon, in the footer's #loc span.
// `loc` and `fn` stay as top-level bindings with their original names in case other
// scripts on the page read them.
let loc = document.getElementById("loc");
// Take the last path segment of the URL. pathname is used instead of href so that a
// query string or #fragment (for example "#sharding" after using the Sections menu)
// is not shown as part of the file name. The backslash alternative in the split
// pattern is retained from the original.
let fn = window.location.pathname.split(/\/|\\/).pop();
// textContent rather than innerHTML: the value is derived from the URL and must be
// inserted as plain text, never parsed as markup.
loc.textContent = fn + ":";
</script>
</body>
</html>